Print this page
3769 Implement SOCK_NONBLOCK flag to socket()
Reviewed-by: Robert Mustacchi <rm@joyent.com>
Split |
Close |
Expand all |
Collapse all |
--- old/usr/src/uts/common/fs/sockfs/socksyscalls.c
+++ new/usr/src/uts/common/fs/sockfs/socksyscalls.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 */
25 25
26 26 /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
27 27
28 28 #include <sys/types.h>
29 29 #include <sys/t_lock.h>
30 30 #include <sys/param.h>
31 31 #include <sys/systm.h>
32 32 #include <sys/buf.h>
33 33 #include <sys/conf.h>
34 34 #include <sys/cred.h>
35 35 #include <sys/kmem.h>
36 36 #include <sys/sysmacros.h>
37 37 #include <sys/vfs.h>
38 38 #include <sys/vnode.h>
39 39 #include <sys/debug.h>
40 40 #include <sys/errno.h>
41 41 #include <sys/time.h>
42 42 #include <sys/file.h>
43 43 #include <sys/user.h>
44 44 #include <sys/stream.h>
45 45 #include <sys/strsubr.h>
46 46 #include <sys/strsun.h>
47 47 #include <sys/sunddi.h>
48 48 #include <sys/esunddi.h>
49 49 #include <sys/flock.h>
50 50 #include <sys/modctl.h>
51 51 #include <sys/cmn_err.h>
52 52 #include <sys/vmsystm.h>
53 53 #include <sys/policy.h>
54 54
55 55 #include <sys/socket.h>
56 56 #include <sys/socketvar.h>
57 57
58 58 #include <sys/isa_defs.h>
59 59 #include <sys/inttypes.h>
60 60 #include <sys/systm.h>
61 61 #include <sys/cpuvar.h>
62 62 #include <sys/filio.h>
63 63 #include <sys/sendfile.h>
64 64 #include <sys/ddi.h>
65 65 #include <vm/seg.h>
66 66 #include <vm/seg_map.h>
67 67 #include <vm/seg_kpm.h>
68 68
69 69 #include <fs/sockfs/nl7c.h>
70 70 #include <fs/sockfs/sockcommon.h>
71 71 #include <fs/sockfs/sockfilter_impl.h>
72 72 #include <fs/sockfs/socktpi.h>
73 73
74 74 #ifdef SOCK_TEST
75 75 int do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */
76 76 #else
77 77 #define do_useracc 1
78 78 #endif /* SOCK_TEST */
79 79
80 80 extern int xnet_truncate_print;
81 81
82 82 extern void nl7c_init(void);
83 83 extern int sockfs_defer_nl7c_init;
84 84
85 85 /*
86 86 * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
87 87 * as there isn't a formal definition of IOV_MAX ???
88 88 */
89 89 #define MSG_MAXIOVLEN 16
90 90
91 91 /*
92 92 * Kernel component of socket creation.
93 93 *
94 94 * The socket library determines which version number to use.
95 95 * First the library calls this with a NULL devpath. If this fails
96 96 * to find a transport (using solookup) the library will look in /etc/netconfig
97 97 * for the appropriate transport. If one is found it will pass in the
98 98 * devpath for the kernel to use.
99 99 */
/*
 * Kernel entry point for socket(3SOCKET).
 *
 * type_w_flags carries the socket type in SOCK_TYPE_MASK plus optional
 * SOCK_CLOEXEC/SOCK_NDELAY/SOCK_NONBLOCK creation flags; any other flag
 * bit is rejected with EINVAL.  Returns a new file descriptor on success
 * or sets errno and returns -1 (via set_errno) on failure.
 */
int
so_socket(int family, int type_w_flags, int protocol, char *devpath,
    int version)
{
	struct sonode *so;
	vnode_t *vp;
	struct file *fp;
	int fd;
	int error;
	int type;

	/* Separate the socket type from the creation flags. */
	type = type_w_flags & SOCK_TYPE_MASK;
	type_w_flags &= ~SOCK_TYPE_MASK;
	if (type_w_flags & ~(SOCK_CLOEXEC|SOCK_NDELAY|SOCK_NONBLOCK))
		return (set_errno(EINVAL));

	if (devpath != NULL) {
		char *buf;
		size_t kdevpathlen = 0;

		/* Copy in the caller-supplied transport device path. */
		buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		if ((error = copyinstr(devpath, buf,
		    MAXPATHLEN, &kdevpathlen)) != 0) {
			kmem_free(buf, MAXPATHLEN);
			return (set_errno(error));
		}
		so = socket_create(family, type, protocol, buf, NULL,
		    SOCKET_SLEEP, version, CRED(), &error);
		kmem_free(buf, MAXPATHLEN);
	} else {
		so = socket_create(family, type, protocol, NULL, NULL,
		    SOCKET_SLEEP, version, CRED(), &error);
	}
	if (so == NULL)
		return (set_errno(error));

	/* Allocate a file descriptor for the socket */
	vp = SOTOV(so);
	if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
		(void) socket_close(so, 0, CRED());
		socket_destroy(so);
		return (set_errno(error));
	}

	/*
	 * Now fill in the entries that falloc reserved.
	 * Apply the nonblocking creation flags to both the sonode state
	 * and the file flags *before* setf() publishes the descriptor,
	 * so another thread can never observe the fd without them.
	 */
	if (type_w_flags & SOCK_NDELAY) {
		so->so_state |= SS_NDELAY;
		fp->f_flag |= FNDELAY;
	}
	if (type_w_flags & SOCK_NONBLOCK) {
		so->so_state |= SS_NONBLOCK;
		fp->f_flag |= FNONBLOCK;
	}
	mutex_exit(&fp->f_tlock);
	setf(fd, fp);
	/* FD_CLOEXEC is per-fd, not per-file, so it is set after setf(). */
	if ((type_w_flags & SOCK_CLOEXEC) != 0) {
		f_setfd(fd, FD_CLOEXEC);
	}

	return (fd);
}
151 163
152 164 /*
153 165 * Map from a file descriptor to a socket node.
154 166 * Returns with the file descriptor held i.e. the caller has to
155 167 * use releasef when done with the file descriptor.
156 168 */
/*
 * Map from a file descriptor to a socket node.
 * Returns with the file descriptor held i.e. the caller has to
 * use releasef when done with the file descriptor.
 * On failure *errorp is set (EBADF or ENOTSOCK) and NULL is returned
 * with the fd already released.
 */
struct sonode *
getsonode(int sock, int *errorp, file_t **fpp)
{
	file_t *fp;
	vnode_t *vp;
	struct sonode *so;

	if ((fp = getf(sock)) == NULL) {
		*errorp = EBADF;
		eprintline(*errorp);
		return (NULL);
	}
	vp = fp->f_vnode;
	/* Check if it is a socket */
	if (vp->v_type != VSOCK) {
		releasef(sock);
		*errorp = ENOTSOCK;
		eprintline(*errorp);
		return (NULL);
	}
	/*
	 * Use the stream head to find the real socket vnode.
	 * This is needed when namefs sits above sockfs.
	 */
	if (vp->v_stream) {
		ASSERT(vp->v_stream->sd_vnode);
		vp = vp->v_stream->sd_vnode;

		so = VTOSO(vp);
		/* A SOV_STREAM node has been converted to a plain stream. */
		if (so->so_version == SOV_STREAM) {
			releasef(sock);
			*errorp = ENOTSOCK;
			eprintsoline(so, *errorp);
			return (NULL);
		}
	} else {
		so = VTOSO(vp);
	}
	if (fpp)
		*fpp = fp;
	return (so);
}
199 211
200 212 /*
201 213 * Allocate and copyin a sockaddr.
202 214 * Ensures NULL termination for AF_UNIX addresses by extending them
203 215 * with one NULL byte if need be. Verifies that the length is not
204 216 * excessive to prevent an application from consuming all of kernel
205 217 * memory. Returns NULL when an error occurred.
206 218 */
/*
 * Allocate and copyin a sockaddr.
 * Ensures NULL termination for AF_UNIX addresses by extending them
 * with one NULL byte if need be. Verifies that the length is not
 * excessive to prevent an application from consuming all of kernel
 * memory. Returns NULL when an error occurred (and sets *errorp).
 * On success the caller owns the returned buffer and must kmem_free
 * it using the (possibly updated) *namelenp.
 */
static struct sockaddr *
copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
    int *errorp)
{
	char *faddr;
	size_t namelen = (size_t)*namelenp;

	ASSERT(namelen != 0);
	/* Cap the length so userland cannot exhaust kernel memory. */
	if (namelen > SO_MAXARGSIZE) {
		*errorp = EINVAL;
		eprintsoline(so, *errorp);
		return (NULL);
	}

	faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
	if (copyin(name, faddr, namelen)) {
		kmem_free(faddr, namelen);
		*errorp = EFAULT;
		eprintsoline(so, *errorp);
		return (NULL);
	}

	/*
	 * Add space for NULL termination if needed.
	 * Do a quick check if the last byte is NUL.
	 */
	if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
		/* Check if there is any NULL termination */
		size_t i;
		int foundnull = 0;

		/* Scan the path portion only, skipping sa_family. */
		for (i = sizeof (name->sa_family); i < namelen; i++) {
			if (faddr[i] == '\0') {
				foundnull = 1;
				break;
			}
		}
		if (!foundnull) {
			/* Add extra byte for NUL padding */
			char *nfaddr;

			nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
			bcopy(faddr, nfaddr, namelen);
			kmem_free(faddr, namelen);

			/* NUL terminate */
			nfaddr[namelen] = '\0';
			namelen++;
			/* Report the extended length back to the caller. */
			ASSERT((socklen_t)namelen == namelen);
			*namelenp = (socklen_t)namelen;
			faddr = nfaddr;
		}
	}
	return ((struct sockaddr *)faddr);
}
262 274
263 275 /*
264 276 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
265 277 */
266 278 static int
267 279 copyout_arg(void *uaddr, socklen_t ulen, void *ulenp,
268 280 void *kaddr, socklen_t klen)
269 281 {
270 282 if (uaddr != NULL) {
271 283 if (ulen > klen)
272 284 ulen = klen;
273 285
274 286 if (ulen != 0) {
275 287 if (copyout(kaddr, uaddr, ulen))
276 288 return (EFAULT);
277 289 }
278 290 } else
279 291 ulen = 0;
280 292
281 293 if (ulenp != NULL) {
282 294 if (copyout(&ulen, ulenp, sizeof (ulen)))
283 295 return (EFAULT);
284 296 }
285 297 return (0);
286 298 }
287 299
288 300 /*
289 301 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
290 302 * If klen is greater than ulen it still uses the non-truncated
291 303 * klen to update ulenp.
292 304 */
293 305 static int
294 306 copyout_name(void *uaddr, socklen_t ulen, void *ulenp,
295 307 void *kaddr, socklen_t klen)
296 308 {
297 309 if (uaddr != NULL) {
298 310 if (ulen >= klen)
299 311 ulen = klen;
300 312 else if (ulen != 0 && xnet_truncate_print) {
301 313 printf("sockfs: truncating copyout of address using "
302 314 "XNET semantics for pid = %d. Lengths %d, %d\n",
303 315 curproc->p_pid, klen, ulen);
304 316 }
305 317
306 318 if (ulen != 0) {
307 319 if (copyout(kaddr, uaddr, ulen))
308 320 return (EFAULT);
309 321 } else
310 322 klen = 0;
311 323 } else
312 324 klen = 0;
313 325
314 326 if (ulenp != NULL) {
315 327 if (copyout(&klen, ulenp, sizeof (klen)))
316 328 return (EFAULT);
317 329 }
318 330 return (0);
319 331 }
320 332
321 333 /*
322 334 * The socketpair() code in libsocket creates two sockets (using
323 335 * the /etc/netconfig fallback if needed) before calling this routine
324 336 * to connect the two sockets together.
325 337 *
326 338 * For a SOCK_STREAM socketpair a listener is needed - in that case this
327 339 * routine will create a new file descriptor as part of accepting the
328 340 * connection. The library socketpair() will check if svs[2] has changed
329 341 * in which case it will close the changed fd.
330 342 *
331 343 * Note that this code could use the TPI feature of accepting the connection
332 344 * on the listening endpoint. However, that would require significant changes
333 345 * to soaccept.
334 346 */
/*
 * Kernel half of socketpair(3SOCKET).  The library has already created
 * the two sockets referenced by sv[]; this routine connects them to
 * each other.  For SOCK_DGRAM the two endpoints are simply bound and
 * cross-connected.  For SOCK_STREAM one endpoint listens and the
 * connection is accepted, producing a *new* fd that replaces sv[0];
 * the library detects the change and closes the original.
 */
int
so_socketpair(int sv[2])
{
	int svs[2];
	struct sonode *so1, *so2;
	int error;
	int orig_flags;
	struct sockaddr_ux *name;
	size_t namelen;
	sotpi_info_t *sti1;
	sotpi_info_t *sti2;

	dprint(1, ("so_socketpair(%p)\n", (void *)sv));

	/* Verify the result array is writable before doing any work. */
	error = useracc(sv, sizeof (svs), B_WRITE);
	if (error && do_useracc)
		return (set_errno(EFAULT));

	if (copyin(sv, svs, sizeof (svs)))
		return (set_errno(EFAULT));

	if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
		return (set_errno(error));

	if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
		releasef(svs[0]);
		return (set_errno(error));
	}

	/* Only AF_UNIX socketpairs are supported. */
	if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
		error = EOPNOTSUPP;
		goto done;
	}

	sti1 = SOTOTPI(so1);
	sti2 = SOTOTPI(so2);

	/*
	 * The code below makes assumptions about the "sockfs" implementation.
	 * So make sure that the correct implementation is really used.
	 */
	ASSERT(so1->so_ops == &sotpi_sonodeops);
	ASSERT(so2->so_ops == &sotpi_sonodeops);

	if (so1->so_type == SOCK_DGRAM) {
		/*
		 * Bind both sockets and connect them with each other.
		 * Need to allocate name/namelen for soconnect.
		 */
		error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}
		error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}
		namelen = sizeof (struct sockaddr_ux);
		name = kmem_alloc(namelen, KM_SLEEP);
		name->sou_family = AF_UNIX;
		/* Cross-connect: so1 -> so2's local address, and back. */
		name->sou_addr = sti2->sti_ux_laddr;
		error = socket_connect(so1,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    0, _SOCONNECT_NOXLATE, CRED());
		if (error) {
			kmem_free(name, namelen);
			eprintsoline(so1, error);
			goto done;
		}
		name->sou_addr = sti1->sti_ux_laddr;
		error = socket_connect(so2,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    0, _SOCONNECT_NOXLATE, CRED());
		kmem_free(name, namelen);
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}
		releasef(svs[0]);
		releasef(svs[1]);
	} else {
		/*
		 * Bind both sockets, with so1 being a listener.
		 * Connect so2 to so1 - nonblocking to avoid waiting for
		 * soaccept to complete.
		 * Accept a connection on so1. Pass out the new fd as sv[0].
		 * The library will detect the changed fd and close
		 * the original one.
		 */
		struct sonode *nso;
		struct vnode *nvp;
		struct file *nfp;
		int nfd;

		/*
		 * We could simply call socket_listen() here (which would do the
		 * binding automatically) if the code didn't rely on passing
		 * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
		 */
		error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
		    _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
		    CRED());
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}
		error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
		if (error) {
			eprintsoline(so2, error);
			goto done;
		}

		namelen = sizeof (struct sockaddr_ux);
		name = kmem_alloc(namelen, KM_SLEEP);
		name->sou_family = AF_UNIX;
		name->sou_addr = sti1->sti_ux_laddr;
		/* Nonblocking connect; EINPROGRESS is the expected outcome. */
		error = socket_connect(so2,
		    (struct sockaddr *)name,
		    (socklen_t)namelen,
		    FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
		kmem_free(name, namelen);
		if (error) {
			if (error != EINPROGRESS) {
				eprintsoline(so2, error); goto done;
			}
		}

		error = socket_accept(so1, 0, CRED(), &nso);
		if (error) {
			eprintsoline(so1, error);
			goto done;
		}

		/* wait for so2 being SS_CONNECTED ignoring signals */
		mutex_enter(&so2->so_lock);
		error = sowaitconnected(so2, 0, 1);
		mutex_exit(&so2->so_lock);
		if (error != 0) {
			(void) socket_close(nso, 0, CRED());
			socket_destroy(nso);
			eprintsoline(so2, error);
			goto done;
		}

		nvp = SOTOV(nso);
		if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
			(void) socket_close(nso, 0, CRED());
			socket_destroy(nso);
			eprintsoline(nso, error);
			goto done;
		}
		/*
		 * copy over FNONBLOCK and FNDELAY flags should they exist
		 * on the listener, so the replacement fd behaves the same.
		 */
		if (so1->so_state & SS_NONBLOCK)
			nfp->f_flag |= FNONBLOCK;
		if (so1->so_state & SS_NDELAY)
			nfp->f_flag |= FNDELAY;

		/*
		 * fill in the entries that falloc reserved
		 */
		mutex_exit(&nfp->f_tlock);
		setf(nfd, nfp);

		/*
		 * get the original fd flags before we release the hold on
		 * svs[0] - after releasef() the fd may no longer be ours
		 * to query.
		 */
		VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);

		releasef(svs[0]);
		releasef(svs[1]);

		/*
		 * If FD_CLOEXEC was set on the filedescriptor we're
		 * swapping out, we should set it on the new one too.
		 */
		if (orig_flags & FD_CLOEXEC) {
			f_setfd(nfd, FD_CLOEXEC);
		}

		/*
		 * The socketpair library routine will close the original
		 * svs[0] when this code passes out a different file
		 * descriptor.
		 */
		svs[0] = nfd;

		if (copyout(svs, sv, sizeof (svs))) {
			(void) closeandsetf(nfd, NULL);
			eprintline(EFAULT);
			return (set_errno(EFAULT));
		}
	}
	return (0);

done:
	releasef(svs[0]);
	releasef(svs[1]);
	return (set_errno(error));
}
528 552
529 553 int
530 554 bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
531 555 {
532 556 struct sonode *so;
533 557 int error;
534 558
535 559 dprint(1, ("bind(%d, %p, %d)\n",
536 560 sock, (void *)name, namelen));
537 561
538 562 if ((so = getsonode(sock, &error, NULL)) == NULL)
539 563 return (set_errno(error));
540 564
541 565 /* Allocate and copyin name */
542 566 /*
543 567 * X/Open test does not expect EFAULT with NULL name and non-zero
544 568 * namelen.
545 569 */
546 570 if (name != NULL && namelen != 0) {
547 571 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
548 572 name = copyin_name(so, name, &namelen, &error);
549 573 if (name == NULL) {
550 574 releasef(sock);
551 575 return (set_errno(error));
552 576 }
553 577 } else {
554 578 name = NULL;
555 579 namelen = 0;
556 580 }
557 581
558 582 switch (version) {
559 583 default:
560 584 error = socket_bind(so, name, namelen, 0, CRED());
561 585 break;
562 586 case SOV_XPG4_2:
563 587 error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
564 588 break;
565 589 case SOV_SOCKBSD:
566 590 error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
567 591 break;
568 592 }
569 593 done:
570 594 releasef(sock);
571 595 if (name != NULL)
572 596 kmem_free(name, (size_t)namelen);
573 597
574 598 if (error)
575 599 return (set_errno(error));
576 600 return (0);
577 601 }
578 602
579 603 /* ARGSUSED2 */
580 604 int
581 605 listen(int sock, int backlog, int version)
582 606 {
583 607 struct sonode *so;
584 608 int error;
585 609
586 610 dprint(1, ("listen(%d, %d)\n",
587 611 sock, backlog));
588 612
589 613 if ((so = getsonode(sock, &error, NULL)) == NULL)
590 614 return (set_errno(error));
591 615
592 616 error = socket_listen(so, backlog, CRED());
593 617
594 618 releasef(sock);
595 619 if (error)
596 620 return (set_errno(error));
597 621 return (0);
598 622 }
599 623
/*
 * accept(3SOCKET) / accept4(3SOCKET) system call entry point.
 *
 * flags may carry SOCK_CLOEXEC/SOCK_NONBLOCK/SOCK_NDELAY (accept4
 * semantics); anything else is EINVAL.  On success returns a new fd
 * for the accepted connection and optionally copies out the peer
 * address.  version is unused.
 */
/*ARGSUSED3*/
int
accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
    int flags)
{
	struct sonode *so;
	file_t *fp;
	int error;
	socklen_t namelen;
	struct sonode *nso;
	struct vnode *nvp;
	struct file *nfp;
	int nfd;
	int ssflags;
	struct sockaddr *addrp;
	socklen_t addrlen;

	dprint(1, ("accept(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
		return (set_errno(EINVAL));
	}

	/* Translate SOCK_ flags to their SS_ variant */
	ssflags = 0;
	if (flags & SOCK_NONBLOCK)
		ssflags |= SS_NONBLOCK;
	if (flags & SOCK_NDELAY)
		ssflags |= SS_NDELAY;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	if (name != NULL) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		if (copyin(namelenp, &namelen, sizeof (namelen))) {
			releasef(sock);
			return (set_errno(EFAULT));
		}
		if (namelen != 0) {
			/* Verify writability up front to fail early. */
			error = useracc(name, (size_t)namelen, B_WRITE);
			if (error && do_useracc) {
				releasef(sock);
				return (set_errno(EFAULT));
			}
		} else
			name = NULL;
	} else {
		namelen = 0;
	}

	/*
	 * Allocate the user fd before socket_accept() in order to
	 * catch EMFILE errors before calling socket_accept().
	 */
	if ((nfd = ufalloc(0)) == -1) {
		eprintsoline(so, EMFILE);
		releasef(sock);
		return (set_errno(EMFILE));
	}
	error = socket_accept(so, fp->f_flag, CRED(), &nso);
	if (error) {
		/* Return the reserved fd slot unused. */
		setf(nfd, NULL);
		releasef(sock);
		return (set_errno(error));
	}

	nvp = SOTOV(nso);

	ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
	if (namelen != 0) {
		addrlen = so->so_max_addr_len;
		addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);

		if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
		    &addrlen, B_TRUE, CRED())) == 0) {
			error = copyout_name(name, namelen, namelenp,
			    addrp, addrlen);
		} else {
			/*
			 * The peer may already have disconnected; report
			 * an aborted connection rather than the raw error.
			 */
			ASSERT(error == EINVAL || error == ENOTCONN);
			error = ECONNABORTED;
		}
		kmem_free(addrp, so->so_max_addr_len);
	}

	if (error) {
		setf(nfd, NULL);
		(void) socket_close(nso, 0, CRED());
		socket_destroy(nso);
		releasef(sock);
		return (set_errno(error));
	}
	if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
		setf(nfd, NULL);
		(void) socket_close(nso, 0, CRED());
		socket_destroy(nso);
		eprintsoline(so, error);
		releasef(sock);
		return (set_errno(error));
	}
	/*
	 * fill in the entries that falloc reserved
	 */
	nfp->f_vnode = nvp;
	mutex_exit(&nfp->f_tlock);
	setf(nfd, nfp);

	/*
	 * Act on SOCK_CLOEXEC from flags
	 */
	if (flags & SOCK_CLOEXEC) {
		f_setfd(nfd, FD_CLOEXEC);
	}

	/*
	 * Copy FNDELAY and FNONBLOCK from listener to acceptor
	 * and from ssflags (the accept4-style flags argument).
	 */
	if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
		uint_t oflag = nfp->f_flag;
		int arg = 0;

		/* FNONBLOCK wins over FNDELAY when both are requested. */
		if ((ssflags | so->so_state) & SS_NONBLOCK)
			arg |= FNONBLOCK;
		else if ((ssflags | so->so_state) & SS_NDELAY)
			arg |= FNDELAY;

		/*
		 * This code is a simplification of the F_SETFL code in fcntl()
		 * Ignore any errors from VOP_SETFL.
		 */
		if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
		    != 0) {
			eprintsoline(so, error);
			error = 0;
		} else {
			mutex_enter(&nfp->f_tlock);
			nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
			nfp->f_flag |= arg;
			mutex_exit(&nfp->f_tlock);
		}
	}
	releasef(sock);
	return (nfd);
}
746 770
747 771 int
748 772 connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
749 773 {
750 774 struct sonode *so;
751 775 file_t *fp;
752 776 int error;
753 777
754 778 dprint(1, ("connect(%d, %p, %d)\n",
755 779 sock, (void *)name, namelen));
756 780
757 781 if ((so = getsonode(sock, &error, &fp)) == NULL)
758 782 return (set_errno(error));
759 783
760 784 /* Allocate and copyin name */
761 785 if (namelen != 0) {
762 786 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
763 787 name = copyin_name(so, name, &namelen, &error);
764 788 if (name == NULL) {
765 789 releasef(sock);
766 790 return (set_errno(error));
767 791 }
768 792 } else
769 793 name = NULL;
770 794
771 795 error = socket_connect(so, name, namelen, fp->f_flag,
772 796 (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
773 797 releasef(sock);
774 798 if (name)
775 799 kmem_free(name, (size_t)namelen);
776 800 if (error)
777 801 return (set_errno(error));
778 802 return (0);
779 803 }
780 804
781 805 /*ARGSUSED2*/
782 806 int
783 807 shutdown(int sock, int how, int version)
784 808 {
785 809 struct sonode *so;
786 810 int error;
787 811
788 812 dprint(1, ("shutdown(%d, %d)\n",
789 813 sock, how));
790 814
791 815 if ((so = getsonode(sock, &error, NULL)) == NULL)
792 816 return (set_errno(error));
793 817
794 818 error = socket_shutdown(so, how, CRED());
795 819
796 820 releasef(sock);
797 821 if (error)
798 822 return (set_errno(error));
799 823 return (0);
800 824 }
801 825
802 826 /*
803 827 * Common receive routine.
804 828 */
/*
 * Common receive routine shared by recv/recvfrom/recvmsg.
 *
 * Receives into uiop, then copies out the source address (namelenp),
 * ancillary data (controllenp) and result flags (flagsp) when the
 * corresponding pointer is non-NULL.  Returns the byte count received
 * or sets errno.  msg->msg_name/msg_control are kernel buffers
 * allocated by socket_recvmsg() and freed here.
 */
static ssize_t
recvit(int sock,
	struct nmsghdr *msg,
	struct uio *uiop,
	int flags,
	socklen_t *namelenp,
	socklen_t *controllenp,
	int *flagsp)
{
	struct sonode *so;
	file_t *fp;
	void *name;
	socklen_t namelen;
	void *control;
	socklen_t controllen;
	ssize_t len;
	int error;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	len = uiop->uio_resid;
	/* Inherit the fd's blocking mode for this receive. */
	uiop->uio_fmode = fp->f_flag;
	uiop->uio_extflg = UIO_COPY_CACHED;

	/* Save the user's buffers; socket_recvmsg overwrites msg fields. */
	name = msg->msg_name;
	namelen = msg->msg_namelen;
	control = msg->msg_control;
	controllen = msg->msg_controllen;

	msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
	    MSG_DONTWAIT | MSG_XPG4_2);

	error = socket_recvmsg(so, msg, uiop, CRED());
	if (error) {
		releasef(sock);
		return (set_errno(error));
	}
	lwp_stat_update(LWP_STAT_MSGRCV, 1);
	releasef(sock);

	error = copyout_name(name, namelen, namelenp,
	    msg->msg_name, msg->msg_namelen);
	if (error)
		goto err;

	if (flagsp != NULL) {
		/*
		 * Clear internal flag.
		 */
		msg->msg_flags &= ~MSG_XPG4_2;

		/*
		 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
		 * when controllen is zero and there is control data to
		 * copy out.
		 */
		if (controllen != 0 &&
		    (msg->msg_controllen > controllen || control == NULL)) {
			dprint(1, ("recvit: CTRUNC %d %d %p\n",
			    msg->msg_controllen, controllen, control));

			msg->msg_flags |= MSG_CTRUNC;
		}
		if (copyout(&msg->msg_flags, flagsp,
		    sizeof (msg->msg_flags))) {
			error = EFAULT;
			goto err;
		}
	}
	/*
	 * Note: This MUST be done last. There can be no "goto err" after this
	 * point since it could make so_closefds run twice on some part
	 * of the file descriptor array.
	 */
	if (controllen != 0) {
		if (!(flags & MSG_XPG4_2)) {
			/*
			 * Good old msg_accrights can only return a multiple
			 * of 4 bytes.
			 */
			controllen &= ~((int)sizeof (uint32_t) - 1);
		}
		error = copyout_arg(control, controllen, controllenp,
		    msg->msg_control, msg->msg_controllen);
		if (error)
			goto err;

		/*
		 * Close any received file descriptors that did not fit in
		 * the user's control buffer (they would otherwise leak).
		 */
		if (msg->msg_controllen > controllen || control == NULL) {
			if (control == NULL)
				controllen = 0;
			so_closefds(msg->msg_control, msg->msg_controllen,
			    !(flags & MSG_XPG4_2), controllen);
		}
	}
	if (msg->msg_namelen != 0)
		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
	if (msg->msg_controllen != 0)
		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
	return (len - uiop->uio_resid);

err:
	/*
	 * If we fail and the control part contains file descriptors
	 * we have to close the fd's.
	 */
	if (msg->msg_controllen != 0)
		so_closefds(msg->msg_control, msg->msg_controllen,
		    !(flags & MSG_XPG4_2), 0);
	if (msg->msg_namelen != 0)
		kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
	if (msg->msg_controllen != 0)
		kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
	return (set_errno(error));
}
920 944
921 945 /*
922 946 * Native system call
923 947 */
924 948 ssize_t
925 949 recv(int sock, void *buffer, size_t len, int flags)
926 950 {
927 951 struct nmsghdr lmsg;
928 952 struct uio auio;
929 953 struct iovec aiov[1];
930 954
931 955 dprint(1, ("recv(%d, %p, %ld, %d)\n",
932 956 sock, buffer, len, flags));
933 957
934 958 if ((ssize_t)len < 0) {
935 959 return (set_errno(EINVAL));
936 960 }
937 961
938 962 aiov[0].iov_base = buffer;
939 963 aiov[0].iov_len = len;
940 964 auio.uio_loffset = 0;
941 965 auio.uio_iov = aiov;
942 966 auio.uio_iovcnt = 1;
943 967 auio.uio_resid = len;
944 968 auio.uio_segflg = UIO_USERSPACE;
945 969 auio.uio_limit = 0;
946 970
947 971 lmsg.msg_namelen = 0;
948 972 lmsg.msg_controllen = 0;
949 973 lmsg.msg_flags = 0;
950 974 return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
951 975 }
952 976
953 977 ssize_t
954 978 recvfrom(int sock, void *buffer, size_t len, int flags,
955 979 struct sockaddr *name, socklen_t *namelenp)
956 980 {
957 981 struct nmsghdr lmsg;
958 982 struct uio auio;
959 983 struct iovec aiov[1];
960 984
961 985 dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
962 986 sock, buffer, len, flags, (void *)name, (void *)namelenp));
963 987
964 988 if ((ssize_t)len < 0) {
965 989 return (set_errno(EINVAL));
966 990 }
967 991
968 992 aiov[0].iov_base = buffer;
969 993 aiov[0].iov_len = len;
970 994 auio.uio_loffset = 0;
971 995 auio.uio_iov = aiov;
972 996 auio.uio_iovcnt = 1;
973 997 auio.uio_resid = len;
974 998 auio.uio_segflg = UIO_USERSPACE;
975 999 auio.uio_limit = 0;
976 1000
977 1001 lmsg.msg_name = (char *)name;
978 1002 if (namelenp != NULL) {
979 1003 if (copyin(namelenp, &lmsg.msg_namelen,
980 1004 sizeof (lmsg.msg_namelen)))
981 1005 return (set_errno(EFAULT));
982 1006 } else {
983 1007 lmsg.msg_namelen = 0;
984 1008 }
985 1009 lmsg.msg_controllen = 0;
986 1010 lmsg.msg_flags = 0;
987 1011
988 1012 return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
989 1013 }
990 1014
/*
 * recvmsg(3SOCKET) entry point.
 *
 * Uses the MSG_XPG4_2 flag to determine if the caller is using
 * struct omsghdr or struct nmsghdr (the latter adds msg_flags).
 * The header is copied in according to the caller's data model, the
 * iovec array is copied in and validated, and the actual receive is
 * delegated to recvit(), which is handed pointers into the user's
 * msghdr so the resulting namelen/controllen/flags can be copied out.
 */
ssize_t
recvmsg(int sock, struct nmsghdr *msg, int flags)
{
	STRUCT_DECL(nmsghdr, u_lmsg);
	STRUCT_HANDLE(nmsghdr, umsgptr);
	struct nmsghdr lmsg;
	struct uio auio;
	struct iovec aiov[MSG_MAXIOVLEN];
	int iovcnt;
	ssize_t len;
	int i;
	int *flagsp;
	model_t	model;

	dprint(1, ("recvmsg(%d, %p, %d)\n",
	    sock, (void *)msg, flags));

	model = get_udatamodel();
	STRUCT_INIT(u_lmsg, model);
	STRUCT_SET_HANDLE(umsgptr, model, msg);

	if (flags & MSG_XPG4_2) {
		if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
			return (set_errno(EFAULT));
		/* Result flags are copied back out to the user's msg_flags. */
		flagsp = STRUCT_FADDR(umsgptr, msg_flags);
	} else {
		/*
		 * Assumes that nmsghdr and omsghdr are identically shaped
		 * except for the added msg_flags field.
		 */
		if (copyin(msg, STRUCT_BUF(u_lmsg),
		    SIZEOF_STRUCT(omsghdr, model)))
			return (set_errno(EFAULT));
		STRUCT_FSET(u_lmsg, msg_flags, 0);
		/* omsghdr has no msg_flags field to report results in. */
		flagsp = NULL;
	}

	/*
	 * Code below us will kmem_alloc memory and hang it
	 * off msg_control and msg_name fields. This forces
	 * us to copy the structure to its native form.
	 */
	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

	iovcnt = lmsg.msg_iovlen;

	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
		return (set_errno(EMSGSIZE));
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded, while ensuring
	 * that they can't move more than 2Gbytes of data in a single call.
	 */
	if (model == DATAMODEL_ILP32) {
		struct iovec32 aiov32[MSG_MAXIOVLEN];
		ssize32_t count32;

		if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
		    iovcnt * sizeof (struct iovec32)))
			return (set_errno(EFAULT));

		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32;

			iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			/* Reject negative lengths and 32-bit total overflow. */
			if (iovlen32 < 0 || count32 < 0)
				return (set_errno(EINVAL));
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
	} else
#endif /* _SYSCALL32_IMPL */
	if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
		return (set_errno(EFAULT));
	}
	/* Sum the iovec lengths, rejecting negative or overflowing totals. */
	len = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		len += iovlen;
		if (iovlen < 0 || len < 0) {
			return (set_errno(EINVAL));
		}
	}
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	/*
	 * Pre-flight accessibility check of the user's control buffer,
	 * gated by the do_useracc tunable.
	 * NOTE(review): this assumes useracc() returns 0 when the access
	 * is permitted -- confirm against the useracc() implementation.
	 */
	if (lmsg.msg_control != NULL &&
	    (do_useracc == 0 ||
	    useracc(lmsg.msg_control, lmsg.msg_controllen,
	    B_WRITE) != 0)) {
		return (set_errno(EFAULT));
	}

	return (recvit(sock, &lmsg, &auio, flags,
	    STRUCT_FADDR(umsgptr, msg_namelen),
	    STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
}
1107 1131
/*
 * Common send function.
 *
 * Copies in the destination name and control data (if any) from the
 * caller's msghdr, then hands the message to the protocol through
 * socket_sendmsg().  On success, returns the number of bytes sent
 * (the starting residual minus what remains in the uio); on failure,
 * returns -1 with errno set.
 *
 * Cleanup is a goto ladder: done1 frees the control buffer, done2
 * frees the copied-in name, done3 releases the file handle.
 */
static ssize_t
sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
{
	struct sonode *so;
	file_t *fp;
	void *name;
	socklen_t namelen;
	void *control;
	socklen_t controllen;
	ssize_t len;
	int error;

	if ((so = getsonode(sock, &error, &fp)) == NULL)
		return (set_errno(error));

	/* Propagate the open file's mode flags to the uio. */
	uiop->uio_fmode = fp->f_flag;

	/*
	 * NOTE(review): cached copy mode is selected for AF_UNIX only,
	 * presumably because local traffic benefits from cached copies --
	 * confirm before relying on this.
	 */
	if (so->so_family == AF_UNIX)
		uiop->uio_extflg = UIO_COPY_CACHED;
	else
		uiop->uio_extflg = UIO_COPY_DEFAULT;

	/* Allocate and copyin name and control */
	name = msg->msg_name;
	namelen = msg->msg_namelen;
	if (name != NULL && namelen != 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		name = copyin_name(so,
		    (struct sockaddr *)name,
		    &namelen, &error);
		if (name == NULL)
			goto done3;
		/* copyin_name null terminates addresses for AF_UNIX */
		msg->msg_namelen = namelen;
		msg->msg_name = name;
	} else {
		msg->msg_name = name = NULL;
		msg->msg_namelen = namelen = 0;
	}

	control = msg->msg_control;
	controllen = msg->msg_controllen;
	if ((control != NULL) && (controllen != 0)) {
		/*
		 * Verify that the length is not excessive to prevent
		 * an application from consuming all of kernel memory.
		 */
		if (controllen > SO_MAXARGSIZE) {
			error = EINVAL;
			goto done2;
		}
		control = kmem_alloc(controllen, KM_SLEEP);

		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		if (copyin(msg->msg_control, control, controllen)) {
			error = EFAULT;
			goto done1;
		}
		msg->msg_control = control;
	} else {
		msg->msg_control = control = NULL;
		msg->msg_controllen = controllen = 0;
	}

	/* Remember the starting residual so bytes sent can be computed. */
	len = uiop->uio_resid;
	msg->msg_flags = flags;

	error = socket_sendmsg(so, msg, uiop, CRED());
done1:
	if (control != NULL)
		kmem_free(control, controllen);
done2:
	if (name != NULL)
		kmem_free(name, namelen);
done3:
	if (error != 0) {
		releasef(sock);
		return (set_errno(error));
	}
	lwp_stat_update(LWP_STAT_MSGSND, 1);
	releasef(sock);
	return (len - uiop->uio_resid);
}
1194 1218
1195 1219 /*
1196 1220 * Native system call
1197 1221 */
1198 1222 ssize_t
1199 1223 send(int sock, void *buffer, size_t len, int flags)
1200 1224 {
1201 1225 struct nmsghdr lmsg;
1202 1226 struct uio auio;
1203 1227 struct iovec aiov[1];
1204 1228
1205 1229 dprint(1, ("send(%d, %p, %ld, %d)\n",
1206 1230 sock, buffer, len, flags));
1207 1231
1208 1232 if ((ssize_t)len < 0) {
1209 1233 return (set_errno(EINVAL));
1210 1234 }
1211 1235
1212 1236 aiov[0].iov_base = buffer;
1213 1237 aiov[0].iov_len = len;
1214 1238 auio.uio_loffset = 0;
1215 1239 auio.uio_iov = aiov;
1216 1240 auio.uio_iovcnt = 1;
1217 1241 auio.uio_resid = len;
1218 1242 auio.uio_segflg = UIO_USERSPACE;
1219 1243 auio.uio_limit = 0;
1220 1244
1221 1245 lmsg.msg_name = NULL;
1222 1246 lmsg.msg_control = NULL;
1223 1247 if (!(flags & MSG_XPG4_2)) {
1224 1248 /*
1225 1249 * In order to be compatible with the libsocket/sockmod
1226 1250 * implementation we set EOR for all send* calls.
1227 1251 */
1228 1252 flags |= MSG_EOR;
1229 1253 }
1230 1254 return (sendit(sock, &lmsg, &auio, flags));
1231 1255 }
1232 1256
/*
 * sendmsg(3SOCKET) entry point.
 *
 * Uses the MSG_XPG4_2 flag to determine if the caller is using
 * struct omsghdr or struct nmsghdr.  The header is copied in
 * according to the caller's data model, the iovec array is copied in
 * and validated, and the actual transmission is delegated to sendit().
 */
ssize_t
sendmsg(int sock, struct nmsghdr *msg, int flags)
{
	struct nmsghdr lmsg;
	STRUCT_DECL(nmsghdr, u_lmsg);
	struct uio auio;
	struct iovec aiov[MSG_MAXIOVLEN];
	int iovcnt;
	ssize_t len;
	int i;
	model_t model;

	dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));

	model = get_udatamodel();
	STRUCT_INIT(u_lmsg, model);

	if (flags & MSG_XPG4_2) {
		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
		    STRUCT_SIZE(u_lmsg)))
			return (set_errno(EFAULT));
	} else {
		/*
		 * Assumes that nmsghdr and omsghdr are identically shaped
		 * except for the added msg_flags field.
		 */
		if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
		    SIZEOF_STRUCT(omsghdr, model)))
			return (set_errno(EFAULT));
		/*
		 * In order to be compatible with the libsocket/sockmod
		 * implementation we set EOR for all send* calls.
		 */
		flags |= MSG_EOR;
	}

	/*
	 * Code below us will kmem_alloc memory and hang it
	 * off msg_control and msg_name fields. This forces
	 * us to copy the structure to its native form.
	 */
	lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
	lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
	lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
	lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
	lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
	lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
	lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);

	iovcnt = lmsg.msg_iovlen;

	if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
		/*
		 * Unless this is XPG 4.2 we allow iovcnt == 0 to
		 * be compatible with SunOS 4.X and 4.4BSD.
		 */
		if (iovcnt != 0 || (flags & MSG_XPG4_2))
			return (set_errno(EMSGSIZE));
	}

#ifdef _SYSCALL32_IMPL
	/*
	 * 32-bit callers need to have their iovec expanded, while ensuring
	 * that they can't move more than 2Gbytes of data in a single call.
	 */
	if (model == DATAMODEL_ILP32) {
		struct iovec32 aiov32[MSG_MAXIOVLEN];
		ssize32_t count32;

		if (iovcnt != 0 &&
		    copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
		    iovcnt * sizeof (struct iovec32)))
			return (set_errno(EFAULT));

		count32 = 0;
		for (i = 0; i < iovcnt; i++) {
			ssize32_t iovlen32;

			iovlen32 = aiov32[i].iov_len;
			count32 += iovlen32;
			/* Reject negative lengths and 32-bit total overflow. */
			if (iovlen32 < 0 || count32 < 0)
				return (set_errno(EINVAL));
			aiov[i].iov_len = iovlen32;
			aiov[i].iov_base =
			    (caddr_t)(uintptr_t)aiov32[i].iov_base;
		}
	} else
#endif /* _SYSCALL32_IMPL */
	if (iovcnt != 0 &&
	    copyin(lmsg.msg_iov, aiov,
	    (unsigned)iovcnt * sizeof (struct iovec))) {
		return (set_errno(EFAULT));
	}
	/* Sum the iovec lengths, rejecting negative or overflowing totals. */
	len = 0;
	for (i = 0; i < iovcnt; i++) {
		ssize_t iovlen = aiov[i].iov_len;
		len += iovlen;
		if (iovlen < 0 || len < 0) {
			return (set_errno(EINVAL));
		}
	}
	auio.uio_loffset = 0;
	auio.uio_iov = aiov;
	auio.uio_iovcnt = iovcnt;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_USERSPACE;
	auio.uio_limit = 0;

	return (sendit(sock, &lmsg, &auio, flags));
}
1347 1371
1348 1372 ssize_t
1349 1373 sendto(int sock, void *buffer, size_t len, int flags,
1350 1374 struct sockaddr *name, socklen_t namelen)
1351 1375 {
1352 1376 struct nmsghdr lmsg;
1353 1377 struct uio auio;
1354 1378 struct iovec aiov[1];
1355 1379
1356 1380 dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1357 1381 sock, buffer, len, flags, (void *)name, namelen));
1358 1382
1359 1383 if ((ssize_t)len < 0) {
1360 1384 return (set_errno(EINVAL));
1361 1385 }
1362 1386
1363 1387 aiov[0].iov_base = buffer;
1364 1388 aiov[0].iov_len = len;
1365 1389 auio.uio_loffset = 0;
1366 1390 auio.uio_iov = aiov;
1367 1391 auio.uio_iovcnt = 1;
1368 1392 auio.uio_resid = len;
1369 1393 auio.uio_segflg = UIO_USERSPACE;
1370 1394 auio.uio_limit = 0;
1371 1395
1372 1396 lmsg.msg_name = (char *)name;
1373 1397 lmsg.msg_namelen = namelen;
1374 1398 lmsg.msg_control = NULL;
1375 1399 if (!(flags & MSG_XPG4_2)) {
1376 1400 /*
1377 1401 * In order to be compatible with the libsocket/sockmod
1378 1402 * implementation we set EOR for all send* calls.
1379 1403 */
1380 1404 flags |= MSG_EOR;
1381 1405 }
1382 1406 return (sendit(sock, &lmsg, &auio, flags));
1383 1407 }
1384 1408
/*
 * getpeername(3SOCKET): return the address of the peer connected to
 * `sock'.  A kernel buffer sized to the socket's maximum address
 * length receives the address, which copyout_name() then copies out
 * (truncated to the user's namelen if necessary).
 */
/*ARGSUSED3*/
int
getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
{
	struct sonode *so;
	int error;
	socklen_t namelen;
	socklen_t sock_addrlen;
	struct sockaddr *sock_addrp;

	dprint(1, ("getpeername(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		goto bad;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	/* A NULL name with a non-zero length is treated as a bad address. */
	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
	    (name == NULL && namelen != 0)) {
		error = EFAULT;
		goto rel_out;
	}
	sock_addrlen = so->so_max_addr_len;
	sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);

	if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
	    B_FALSE, CRED())) == 0) {
		ASSERT(sock_addrlen <= so->so_max_addr_len);
		error = copyout_name(name, namelen, namelenp,
		    (void *)sock_addrp, sock_addrlen);
	}
	/*
	 * Free with the originally allocated size; socket_getpeername()
	 * may have shrunk sock_addrlen to the actual address length.
	 */
	kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
	releasef(sock);
bad:	return (error != 0 ? set_errno(error) : 0);
}
1421 1445
/*
 * getsockname(3SOCKET): return the local address bound to `sock'.
 * Mirrors getpeername() but queries the local, not the remote, address.
 */
/*ARGSUSED3*/
int
getsockname(int sock, struct sockaddr *name,
    socklen_t *namelenp, int version)
{
	struct sonode *so;
	int error;
	socklen_t namelen, sock_addrlen;
	struct sockaddr *sock_addrp;

	dprint(1, ("getsockname(%d, %p, %p)\n",
	    sock, (void *)name, (void *)namelenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		goto bad;

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	/* A NULL name with a non-zero length is treated as a bad address. */
	if (copyin(namelenp, &namelen, sizeof (namelen)) ||
	    (name == NULL && namelen != 0)) {
		error = EFAULT;
		goto rel_out;
	}

	sock_addrlen = so->so_max_addr_len;
	sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
	if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
	    CRED())) == 0) {
		ASSERT(MUTEX_NOT_HELD(&so->so_lock));
		ASSERT(sock_addrlen <= so->so_max_addr_len);
		error = copyout_name(name, namelen, namelenp,
		    (void *)sock_addrp, sock_addrlen);
	}
	/*
	 * Free with the originally allocated size; socket_getsockname()
	 * may have shrunk sock_addrlen to the actual address length.
	 */
	kmem_free(sock_addrp, so->so_max_addr_len);
rel_out:
	releasef(sock);
bad:	return (error != 0 ? set_errno(error) : 0);
}
1459 1483
/*
 * getsockopt(3SOCKET): fetch a socket option value.
 *
 * The option value is read into a kernel buffer sized by the
 * caller-supplied length, then copied back out together with the
 * (possibly smaller) length actually produced by the protocol.
 */
/*ARGSUSED5*/
int
getsockopt(int sock,
    int level,
    int option_name,
    void *option_value,
    socklen_t *option_lenp,
    int version)
{
	struct sonode *so;
	socklen_t optlen, optlen_res;	/* requested vs. returned length */
	void *optval;
	int error;

	dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
	    sock, level, option_name, option_value, (void *)option_lenp));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	ASSERT(MUTEX_NOT_HELD(&so->so_lock));
	if (copyin(option_lenp, &optlen, sizeof (optlen))) {
		releasef(sock);
		return (set_errno(EFAULT));
	}
	/*
	 * Verify that the length is not excessive to prevent
	 * an application from consuming all of kernel memory.
	 */
	if (optlen > SO_MAXARGSIZE) {
		error = EINVAL;
		releasef(sock);
		return (set_errno(error));
	}
	optval = kmem_alloc(optlen, KM_SLEEP);
	optlen_res = optlen;
	error = socket_getsockopt(so, level, option_name, optval,
	    &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
	    CRED());
	releasef(sock);
	if (error) {
		kmem_free(optval, optlen);
		return (set_errno(error));
	}
	/* Copy out at most optlen bytes plus the resulting length. */
	error = copyout_arg(option_value, optlen, option_lenp,
	    optval, optlen_res);
	/* Free with the allocated size (optlen), not the returned size. */
	kmem_free(optval, optlen);
	if (error)
		return (set_errno(error));
	return (0);
}
1511 1535
/*
 * setsockopt(3SOCKET): set a socket option value.
 *
 * Small option values (up to two pointer words) are staged in an
 * on-stack buffer to avoid a kmem allocation; larger values are
 * kmem_alloc'd and freed after the call.
 */
/*ARGSUSED5*/
int
setsockopt(int sock,
    int level,
    int option_name,
    void *option_value,
    socklen_t option_len,
    int version)
{
	struct sonode *so;
	intptr_t buffer[2];	/* on-stack staging for small option values */
	void *optval = NULL;
	int error;

	dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
	    sock, level, option_name, option_value, option_len));

	if ((so = getsonode(sock, &error, NULL)) == NULL)
		return (set_errno(error));

	if (option_value != NULL) {
		if (option_len != 0) {
			/*
			 * Verify that the length is not excessive to prevent
			 * an application from consuming all of kernel memory.
			 */
			if (option_len > SO_MAXARGSIZE) {
				error = EINVAL;
				goto done2;
			}
			optval = option_len <= sizeof (buffer) ?
			    &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
			ASSERT(MUTEX_NOT_HELD(&so->so_lock));
			if (copyin(option_value, optval, (size_t)option_len)) {
				error = EFAULT;
				goto done1;
			}
		}
	} else
		option_len = 0;

	error = socket_setsockopt(so, level, option_name, optval,
	    (t_uscalar_t)option_len, CRED());
done1:
	/*
	 * Only free heap-allocated values: optval may instead point at
	 * the on-stack buffer, or still be NULL when no option value
	 * was supplied.
	 * NOTE(review): the NULL case relies on kmem_free(NULL, 0)
	 * being a no-op -- confirm against the kmem implementation.
	 */
	if (optval != buffer)
		kmem_free(optval, (size_t)option_len);
done2:
	releasef(sock);
	if (error)
		return (set_errno(error));
	return (0);
}
1564 1588
/*
 * SOCKCONFIG_ADD_SOCK: register a socket configuration entry mapping
 * <family, type, protocol> to either a STREAMS device (names starting
 * with "/dev") or a socket module (any other name).  The name string
 * is copied in from user space and a right-sized copy is handed to
 * sockparams_create(), which keeps it until the entry is removed.
 */
static int
sockconf_add_sock(int family, int type, int protocol, char *name)
{
	int error = 0;
	char *kdevpath = NULL;	/* device path, for "/dev..." names */
	char *kmodule = NULL;	/* socket module name, otherwise */
	char *buf = NULL;
	size_t pathlen = 0;
	struct sockparams *sp;

	if (name == NULL)
		return (EINVAL);
	/*
	 * Copyin the name.
	 * This also makes it possible to check for too long pathnames.
	 * Compress the space needed for the name before passing it
	 * to soconfig - soconfig will store the string until
	 * the configuration is removed.
	 */
	buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
	if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
		kmem_free(buf, MAXPATHLEN);
		return (error);
	}
	if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
		/* For device */

		/*
		 * Special handling for NCA:
		 *
		 * DEV_NCA is never opened even if an application
		 * requests for AF_NCA. The device opened is instead a
		 * predefined AF_INET transport (NCA_INET_DEV).
		 *
		 * Prior to Volo (PSARC/2007/587) NCA would determine
		 * the device using a lookup, which worked then because
		 * all protocols were based on TPI. Since TPI is no
		 * longer the default, we have to explicitly state
		 * which device to use.
		 */
		if (strcmp(buf, NCA_DEV) == 0) {
			/* only support entry <28, 2, 0> */
			if (family != AF_NCA || type != SOCK_STREAM ||
			    protocol != 0) {
				kmem_free(buf, MAXPATHLEN);
				return (EINVAL);
			}

			/* Substitute the predefined AF_INET device path. */
			pathlen = strlen(NCA_INET_DEV) + 1;
			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
			bcopy(NCA_INET_DEV, kdevpath, pathlen);
			kdevpath[pathlen - 1] = '\0';
		} else {
			kdevpath = kmem_alloc(pathlen, KM_SLEEP);
			bcopy(buf, kdevpath, pathlen);
			kdevpath[pathlen - 1] = '\0';
		}
	} else {
		/* For socket module */
		kmodule = kmem_alloc(pathlen, KM_SLEEP);
		bcopy(buf, kmodule, pathlen);
		kmodule[pathlen - 1] = '\0';
		/* A module entry carries no device path length. */
		pathlen = 0;
	}
	kmem_free(buf, MAXPATHLEN);

	/* sockparams_create frees mod name and devpath upon failure */
	sp = sockparams_create(family, type, protocol, kmodule,
	    kdevpath, pathlen, 0, KM_SLEEP, &error);
	if (sp != NULL) {
		error = sockparams_add(sp);
		if (error != 0)
			sockparams_destroy(sp);
	}

	return (error);
}
1642 1666
/*
 * SOCKCONFIG_REMOVE_SOCK: delete the sockparams entry for the given
 * <family, type, protocol> tuple.  Thin wrapper around
 * sockparams_delete().
 */
static int
sockconf_remove_sock(int family, int type, int protocol)
{
	return (sockparams_delete(family, type, protocol));
}
1648 1672
1649 1673 static int
1650 1674 sockconfig_remove_filter(const char *uname)
1651 1675 {
1652 1676 char kname[SOF_MAXNAMELEN];
1653 1677 size_t len;
1654 1678 int error;
1655 1679 sof_entry_t *ent;
1656 1680
1657 1681 if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
1658 1682 return (error);
1659 1683
1660 1684 ent = sof_entry_remove_by_name(kname);
1661 1685 if (ent == NULL)
1662 1686 return (ENXIO);
1663 1687
1664 1688 mutex_enter(&ent->sofe_lock);
1665 1689 ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
1666 1690 if (ent->sofe_refcnt == 0) {
1667 1691 mutex_exit(&ent->sofe_lock);
1668 1692 sof_entry_free(ent);
1669 1693 } else {
1670 1694 /* let the last socket free the filter */
1671 1695 ent->sofe_flags |= SOFEF_CONDEMED;
1672 1696 mutex_exit(&ent->sofe_lock);
1673 1697 }
1674 1698
1675 1699 return (0);
1676 1700 }
1677 1701
1678 1702 static int
1679 1703 sockconfig_add_filter(const char *uname, void *ufilpropp)
1680 1704 {
1681 1705 struct sockconfig_filter_props filprop;
1682 1706 sof_entry_t *ent;
1683 1707 int error;
1684 1708 size_t tuplesz, len;
1685 1709 char hintbuf[SOF_MAXNAMELEN];
1686 1710
1687 1711 ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1688 1712 mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1689 1713
1690 1714 if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1691 1715 &len)) != 0) {
1692 1716 sof_entry_free(ent);
1693 1717 return (error);
1694 1718 }
1695 1719
1696 1720 if (get_udatamodel() == DATAMODEL_NATIVE) {
1697 1721 if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1698 1722 sof_entry_free(ent);
1699 1723 return (EFAULT);
1700 1724 }
1701 1725 }
1702 1726 #ifdef _SYSCALL32_IMPL
1703 1727 else {
1704 1728 struct sockconfig_filter_props32 filprop32;
1705 1729
1706 1730 if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1707 1731 sof_entry_free(ent);
1708 1732 return (EFAULT);
1709 1733 }
1710 1734 filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1711 1735 filprop.sfp_autoattach = filprop32.sfp_autoattach;
1712 1736 filprop.sfp_hint = filprop32.sfp_hint;
1713 1737 filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1714 1738 filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1715 1739 filprop.sfp_socktuple =
1716 1740 (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1717 1741 }
1718 1742 #endif /* _SYSCALL32_IMPL */
1719 1743
1720 1744 if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1721 1745 sizeof (ent->sofe_modname), &len)) != 0) {
1722 1746 sof_entry_free(ent);
1723 1747 return (error);
1724 1748 }
1725 1749
1726 1750 /*
1727 1751 * A filter must specify at least one socket tuple.
1728 1752 */
1729 1753 if (filprop.sfp_socktuple_cnt == 0 ||
1730 1754 filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1731 1755 sof_entry_free(ent);
1732 1756 return (EINVAL);
1733 1757 }
1734 1758 ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1735 1759 ent->sofe_hint = filprop.sfp_hint;
1736 1760
1737 1761 /*
1738 1762 * Verify the hint, and copy in the hint argument, if necessary.
1739 1763 */
1740 1764 switch (ent->sofe_hint) {
1741 1765 case SOF_HINT_BEFORE:
1742 1766 case SOF_HINT_AFTER:
1743 1767 if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1744 1768 sizeof (hintbuf), &len)) != 0) {
1745 1769 sof_entry_free(ent);
1746 1770 return (error);
1747 1771 }
1748 1772 ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1749 1773 bcopy(hintbuf, ent->sofe_hintarg, len);
1750 1774 /* FALLTHRU */
1751 1775 case SOF_HINT_TOP:
1752 1776 case SOF_HINT_BOTTOM:
1753 1777 /* hints cannot be used with programmatic filters */
1754 1778 if (ent->sofe_flags & SOFEF_PROG) {
1755 1779 sof_entry_free(ent);
1756 1780 return (EINVAL);
1757 1781 }
1758 1782 break;
1759 1783 case SOF_HINT_NONE:
1760 1784 break;
1761 1785 default:
1762 1786 /* bad hint value */
1763 1787 sof_entry_free(ent);
1764 1788 return (EINVAL);
1765 1789 }
1766 1790
1767 1791 ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1768 1792 tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1769 1793 ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1770 1794
1771 1795 if (get_udatamodel() == DATAMODEL_NATIVE) {
1772 1796 if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1773 1797 tuplesz)) {
1774 1798 sof_entry_free(ent);
1775 1799 return (EFAULT);
1776 1800 }
1777 1801 }
1778 1802 #ifdef _SYSCALL32_IMPL
1779 1803 else {
1780 1804 int i;
1781 1805 caddr_t data = (caddr_t)filprop.sfp_socktuple;
1782 1806 sof_socktuple_t *tup = ent->sofe_socktuple;
1783 1807 sof_socktuple32_t tup32;
1784 1808
1785 1809 tup = ent->sofe_socktuple;
1786 1810 for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
1787 1811 ASSERT(tup < ent->sofe_socktuple + tuplesz);
1788 1812
1789 1813 if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1790 1814 sof_entry_free(ent);
1791 1815 return (EFAULT);
1792 1816 }
1793 1817 tup->sofst_family = tup32.sofst_family;
1794 1818 tup->sofst_type = tup32.sofst_type;
1795 1819 tup->sofst_protocol = tup32.sofst_protocol;
1796 1820
1797 1821 data += sizeof (tup32);
1798 1822 }
1799 1823 }
1800 1824 #endif /* _SYSCALL32_IMPL */
1801 1825
1802 1826 /* Sockets can start using the filter as soon as the filter is added */
1803 1827 if ((error = sof_entry_add(ent)) != 0)
1804 1828 sof_entry_free(ent);
1805 1829
1806 1830 return (error);
1807 1831 }
1808 1832
1809 1833 /*
1810 1834 * Socket configuration system call. It is used to add and remove
1811 1835 * socket types.
1812 1836 */
1813 1837 int
1814 1838 sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1815 1839 {
1816 1840 int error = 0;
1817 1841
1818 1842 if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1819 1843 return (set_errno(EPERM));
1820 1844
1821 1845 if (sockfs_defer_nl7c_init) {
1822 1846 nl7c_init();
1823 1847 sockfs_defer_nl7c_init = 0;
1824 1848 }
1825 1849
1826 1850 switch (cmd) {
1827 1851 case SOCKCONFIG_ADD_SOCK:
1828 1852 error = sockconf_add_sock((int)(uintptr_t)arg1,
1829 1853 (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1830 1854 break;
1831 1855 case SOCKCONFIG_REMOVE_SOCK:
1832 1856 error = sockconf_remove_sock((int)(uintptr_t)arg1,
1833 1857 (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1834 1858 break;
1835 1859 case SOCKCONFIG_ADD_FILTER:
1836 1860 error = sockconfig_add_filter((const char *)arg1, arg2);
1837 1861 break;
1838 1862 case SOCKCONFIG_REMOVE_FILTER:
1839 1863 error = sockconfig_remove_filter((const char *)arg1);
1840 1864 break;
1841 1865 default:
1842 1866 #ifdef DEBUG
1843 1867 cmn_err(CE_NOTE, "sockconfig: unkonwn subcommand %d", cmd);
1844 1868 #endif
1845 1869 error = EINVAL;
1846 1870 break;
1847 1871 }
1848 1872
1849 1873 if (error != 0) {
1850 1874 eprintline(error);
1851 1875 return (set_errno(error));
1852 1876 }
1853 1877 return (0);
1854 1878 }
1855 1879
1856 1880
1857 1881 /*
1858 1882 * Sendfile is implemented through two schemes, direct I/O or by
1859 1883 * caching in the filesystem page cache. We cache the input file by
1860 1884 * default and use direct I/O only if sendfile_max_size is set
1861 1885 * appropriately as explained below. Note that this logic is consistent
1862 1886 * with other filesystems where caching is turned on by default
1863 1887 * unless explicitly turned off by using the DIRECTIO ioctl.
1864 1888 *
1865 1889 * We choose a slightly different scheme here. One can turn off
1866 1890 * caching by setting sendfile_max_size to 0. One can also enable
1867 1891 * caching of files <= sendfile_max_size by setting sendfile_max_size
1868 1892 * to an appropriate value. By default sendfile_max_size is set to the
1869 1893 * maximum value so that all files are cached. In future, we may provide
1870 1894 * better interfaces for caching the file.
1871 1895 *
1872 1896 * Sendfile through Direct I/O (Zero copy)
1873 1897 * --------------------------------------
1874 1898 *
1875 1899 * As disks are normally slower than the network, we can't have a
1876 1900 * single thread that reads the disk and writes to the network. We
1877 1901 * need to have parallelism. This is done by having the sendfile
1878 1902 * thread create another thread that reads from the filesystem
1879 1903 * and queues it for network processing. In this scheme, the data
1880 1904 * is never copied anywhere i.e it is zero copy unlike the other
1881 1905 * scheme.
1882 1906 *
1883 1907 * We have a sendfile queue (snfq) where each sendfile
1884 1908 * request (snf_req_t) is queued for processing by a thread. Number
1885 1909 * of threads is dynamically allocated and they exit if they are idling
1886 1910 * beyond a specified amount of time. When each request (snf_req_t) is
1887 1911 * processed by a thread, it produces a number of mblk_t structures to
1888 1912 * be consumed by the sendfile thread. snf_deque and snf_enque are
1889 1913 * used for consuming and producing mblks. Size of the filesystem
1890 1914 * read is determined by the tunable (sendfile_read_size). A single
1891 1915 * mblk holds sendfile_read_size worth of data (except the last
1892 1916 * read of the file) which is sent down as a whole to the network.
1893 1917 * sendfile_read_size is set to 1 MB as this seems to be the optimal
1894 1918 * value for the UFS filesystem backed by a striped storage array.
1895 1919 *
1896 1920 * Synchronisation between read (producer) and write (consumer) threads.
1897 1921 * --------------------------------------------------------------------
1898 1922 *
1899 1923 * sr_lock protects sr_ib_head and sr_ib_tail. The lock is held while
1900 1924 * adding and deleting items in this list. Error can happen anytime
1901 1925 * during read or write. There could be unprocessed mblks in the
1902 1926 * sr_ib_XXX list when a read or write error occurs. Whenever error
1903 1927 * is encountered, we need two things to happen :
1904 1928 *
1905 1929 * a) One of the threads need to clean the mblks.
1906 1930 * b) When one thread encounters an error, the other should stop.
1907 1931 *
1908 1932 * For (a), we don't want to penalize the reader thread as it could do
1909 1933 * some useful work processing other requests. For (b), the error can
1910 1934 * be detected by examining sr_read_error or sr_write_error.
1911 1935 * sr_lock protects sr_read_error and sr_write_error. If both reader and
1912 1936 * writer encounters error, we need to report the write error back to
1913 1937 * the application as that's what would have happened if the operations
1914 1938 * were done sequentially. With this in mind, following should work :
1915 1939 *
1916 1940 * - Check for errors before read or write.
1917 1941 * - If the reader encounters error, set the error in sr_read_error.
1918 1942 * Check sr_write_error, if it is set, send cv_signal as it is
1919 1943 * waiting for reader to complete. If it is not set, the writer
1920 1944 * is either running sinking data to the network or blocked
1921 1945 * because of flow control. For handling the latter case, we
1922 1946 * always send a signal. In any case, it will examine sr_read_error
1923 1947 * and return. sr_read_error is marked with SR_READ_DONE to tell
1924 1948 * the writer that the reader is done in all the cases.
1925 1949 * - If the writer encounters error, set the error in sr_write_error.
1926 1950 * The reader thread is either blocked because of flow control or
1927 1951 * running reading data from the disk. For the former, we need to
1928 1952 * wakeup the thread. Again to keep it simple, we always wake up
1929 1953 * the reader thread. Then, wait for the read thread to complete
1930 1954 * if it is not done yet. Cleanup and return.
1931 1955 *
1932 1956 * High and low water marks for the read thread.
1933 1957 * --------------------------------------------
1934 1958 *
1935 1959 * If sendfile() is used to send data over a slow network, we need to
1936 1960 * make sure that the read thread does not produce data at a faster
1937 1961 * rate than the network. This can happen if the disk is faster than
1938 1962 * the network. In such a case, we don't want to build a very large queue.
1939 1963 * But we would still like to get all of the network throughput possible.
1940 1964 * This implies that network should never block waiting for data.
 1941 1965  * As there are a lot of disk throughput/network throughput combinations
1942 1966 * possible, it is difficult to come up with an accurate number.
1943 1967 * A typical 10K RPM disk has a max seek latency 17ms and rotational
1944 1968 * latency of 3ms for reading a disk block. Thus, the total latency to
1945 1969 * initiate a new read, transfer data from the disk and queue for
 1946 1970  * transmission would take about a max of 25ms. Today's max transfer rate
1947 1971 * for network is 100MB/sec. If the thread is blocked because of flow
1948 1972 * control, it would take 25ms to get new data ready for transmission.
1949 1973 * We have to make sure that network is not idling, while we are initiating
1950 1974 * new transfers. So, at 100MB/sec, to keep network busy we would need
1951 1975 * 2.5MB of data. Rounding off, we keep the low water mark to be 3MB of data.
1952 1976 * We need to pick a high water mark so that the woken up thread would
1953 1977 * do considerable work before blocking again to prevent thrashing. Currently,
1954 1978 * we pick this to be 10 times that of the low water mark.
1955 1979 *
1956 1980 * Sendfile with segmap caching (One copy from page cache to mblks).
1957 1981 * ----------------------------------------------------------------
1958 1982 *
1959 1983 * We use the segmap cache for caching the file, if the size of file
1960 1984 * is <= sendfile_max_size. In this case we don't use threads as VM
1961 1985 * is reasonably fast enough to keep up with the network. If the underlying
1962 1986 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1963 1987 * of data into segmap space, and use the virtual address from segmap
1964 1988 * directly through desballoc() to avoid copy. Once the transport is done
1965 1989 * with the data, the mapping will be released through segmap_release()
1966 1990 * called by the call-back routine.
1967 1991 *
1968 1992 * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1969 1993 * to copy the data from the filesystem into our temporary network buffer.
1970 1994 *
1971 1995 * To disable caching, set sendfile_max_size to 0.
1972 1996 */
1973 1997
1974 1998 uint_t sendfile_read_size = 1024 * 1024;
1975 1999 #define SENDFILE_REQ_LOWAT 3 * 1024 * 1024
1976 2000 uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
1977 2001 uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
1978 2002 struct sendfile_stats sf_stats;
1979 2003 struct sendfile_queue *snfq;
1980 2004 clock_t snfq_timeout;
1981 2005 off64_t sendfile_max_size;
1982 2006
1983 2007 static void snf_enque(snf_req_t *, mblk_t *);
1984 2008 static mblk_t *snf_deque(snf_req_t *);
1985 2009
1986 2010 void
1987 2011 sendfile_init(void)
1988 2012 {
1989 2013 snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
1990 2014
1991 2015 mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
1992 2016 cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
1993 2017 snfq->snfq_max_threads = max_ncpus;
1994 2018 snfq_timeout = SNFQ_TIMEOUT;
1995 2019 /* Cache all files by default. */
1996 2020 sendfile_max_size = MAXOFFSET_T;
1997 2021 }
1998 2022
/*
 * Queues a mblk_t for network processing.
 * Called by the reader thread (snf_async_read); may block if the queue
 * grows past sr_hiwat, unless the writer side has recorded an error.
 */
static void
snf_enque(snf_req_t *sr, mblk_t *mp)
{
	mp->b_next = NULL;
	mutex_enter(&sr->sr_lock);
	if (sr->sr_mp_head == NULL) {
		sr->sr_mp_head = sr->sr_mp_tail = mp;
		/*
		 * Signal only on the empty->non-empty transition; the
		 * writer may be blocked in snf_deque waiting for data.
		 */
		cv_signal(&sr->sr_cv);
	} else {
		sr->sr_mp_tail->b_next = mp;
		sr->sr_mp_tail = mp;
	}
	sr->sr_qlen += MBLKL(mp);
	/*
	 * Throttle the reader above the high water mark; snf_deque
	 * signals us once the queue drains below sr_lowat, and
	 * snf_direct_io signals when it records a write error.
	 */
	while ((sr->sr_qlen > sr->sr_hiwat) &&
	    (sr->sr_write_error == 0)) {
		sf_stats.ss_full_waits++;
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	mutex_exit(&sr->sr_lock);
}
2022 2046
/*
 * De-queues a mblk_t for network processing.
 * Returns NULL once the reader has failed, or has finished and the queue
 * has been drained; the caller treats NULL as end-of-stream.
 */
static mblk_t *
snf_deque(snf_req_t *sr)
{
	mblk_t *mp;

	mutex_enter(&sr->sr_lock);
	/*
	 * If we have encountered an error on read or read is
	 * completed and no more mblks, return NULL.
	 * We need to check for NULL sr_mp_head also as
	 * the reads could have completed and there is
	 * nothing more to come.
	 */
	if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
	    ((sr->sr_read_error & SR_READ_DONE) &&
	    sr->sr_mp_head == NULL)) {
		mutex_exit(&sr->sr_lock);
		return (NULL);
	}
	/*
	 * To start with neither SR_READ_DONE is marked nor
	 * the error is set. When we wake up from cv_wait,
	 * following are the possibilities :
	 *
	 *	a) sr_read_error is zero and mblks are queued.
	 *	b) sr_read_error is set to SR_READ_DONE
	 *	   and mblks are queued.
	 *	c) sr_read_error is set to SR_READ_DONE
	 *	   and no mblks.
	 *	d) sr_read_error is set to some error other
	 *	   than SR_READ_DONE.
	 */

	while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
		sf_stats.ss_empty_waits++;
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	/* Handle (a) and (b) first  - the normal case. */
	if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
	    (sr->sr_mp_head != NULL)) {
		mp = sr->sr_mp_head;
		sr->sr_mp_head = mp->b_next;
		sr->sr_qlen -= MBLKL(mp);
		/* Wake the reader if it is throttled on the high water mark. */
		if (sr->sr_qlen < sr->sr_lowat)
			cv_signal(&sr->sr_cv);
		mutex_exit(&sr->sr_lock);
		mp->b_next = NULL;
		return (mp);
	}
	/* Handle (c) and (d). */
	mutex_exit(&sr->sr_lock);
	return (NULL);
}
2079 2103
2080 2104 /*
2081 2105 * Reads data from the filesystem and queues it for network processing.
2082 2106 */
2083 2107 void
2084 2108 snf_async_read(snf_req_t *sr)
2085 2109 {
2086 2110 size_t iosize;
2087 2111 u_offset_t fileoff;
2088 2112 u_offset_t size;
2089 2113 int ret_size;
2090 2114 int error;
2091 2115 file_t *fp;
2092 2116 mblk_t *mp;
2093 2117 struct vnode *vp;
2094 2118 int extra = 0;
2095 2119 int maxblk = 0;
2096 2120 int wroff = 0;
2097 2121 struct sonode *so;
2098 2122
2099 2123 fp = sr->sr_fp;
2100 2124 size = sr->sr_file_size;
2101 2125 fileoff = sr->sr_file_off;
2102 2126
2103 2127 /*
2104 2128 * Ignore the error for filesystems that doesn't support DIRECTIO.
2105 2129 */
2106 2130 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2107 2131 kcred, NULL, NULL);
2108 2132
2109 2133 vp = sr->sr_vp;
2110 2134 if (vp->v_type == VSOCK) {
2111 2135 stdata_t *stp;
2112 2136
2113 2137 /*
2114 2138 * Get the extra space to insert a header and a trailer.
2115 2139 */
2116 2140 so = VTOSO(vp);
2117 2141 stp = vp->v_stream;
2118 2142 if (stp == NULL) {
2119 2143 wroff = so->so_proto_props.sopp_wroff;
2120 2144 maxblk = so->so_proto_props.sopp_maxblk;
2121 2145 extra = wroff + so->so_proto_props.sopp_tail;
2122 2146 } else {
2123 2147 wroff = (int)(stp->sd_wroff);
2124 2148 maxblk = (int)(stp->sd_maxblk);
2125 2149 extra = wroff + (int)(stp->sd_tail);
2126 2150 }
2127 2151 }
2128 2152
2129 2153 while ((size != 0) && (sr->sr_write_error == 0)) {
2130 2154
2131 2155 iosize = (int)MIN(sr->sr_maxpsz, size);
2132 2156
2133 2157 /*
2134 2158 * Socket filters can limit the mblk size,
2135 2159 * so limit reads to maxblk if there are
2136 2160 * filters present.
2137 2161 */
2138 2162 if (vp->v_type == VSOCK &&
2139 2163 so->so_filter_active > 0 && maxblk != INFPSZ)
2140 2164 iosize = (int)MIN(iosize, maxblk);
2141 2165
2142 2166 if (is_system_labeled()) {
2143 2167 mp = allocb_cred(iosize + extra, CRED(),
2144 2168 curproc->p_pid);
2145 2169 } else {
2146 2170 mp = allocb(iosize + extra, BPRI_MED);
2147 2171 }
2148 2172 if (mp == NULL) {
2149 2173 error = EAGAIN;
2150 2174 break;
2151 2175 }
2152 2176
2153 2177 mp->b_rptr += wroff;
2154 2178
2155 2179 ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2156 2180
2157 2181 /* Error or Reached EOF ? */
2158 2182 if ((error != 0) || (ret_size == 0)) {
2159 2183 freeb(mp);
2160 2184 break;
2161 2185 }
2162 2186 mp->b_wptr = mp->b_rptr + ret_size;
2163 2187
2164 2188 snf_enque(sr, mp);
2165 2189 size -= ret_size;
2166 2190 fileoff += ret_size;
2167 2191 }
2168 2192 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2169 2193 kcred, NULL, NULL);
2170 2194 mutex_enter(&sr->sr_lock);
2171 2195 sr->sr_read_error = error;
2172 2196 sr->sr_read_error |= SR_READ_DONE;
2173 2197 cv_signal(&sr->sr_cv);
2174 2198 mutex_exit(&sr->sr_lock);
2175 2199 }
2176 2200
/*
 * Service thread body: pulls sendfile read requests off the global snfq
 * queue and processes each with snf_async_read().  An idle thread lingers
 * for snfq_timeout ticks waiting for more work and exits if none arrives.
 */
void
snf_async_thread(void)
{
	snf_req_t *sr;
	callb_cpr_t cprinfo;
	clock_t time_left = 1;

	CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");

	mutex_enter(&snfq->snfq_lock);
	for (;;) {
		/*
		 * If we didn't find a entry, then block until woken up
		 * again and then look through the queues again.
		 */
		while ((sr = snfq->snfq_req_head) == NULL) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			/* Timed out with nothing queued: retire the thread. */
			if (time_left <= 0) {
				snfq->snfq_svc_threads--;
				CALLB_CPR_EXIT(&cprinfo);
				thread_exit();
				/* NOTREACHED */
			}
			snfq->snfq_idle_cnt++;

			time_left = cv_reltimedwait(&snfq->snfq_cv,
			    &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
			snfq->snfq_idle_cnt--;

			CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
		}
		snfq->snfq_req_head = sr->sr_next;
		snfq->snfq_req_cnt--;
		/* Drop the queue lock while doing the actual file I/O. */
		mutex_exit(&snfq->snfq_lock);
		snf_async_read(sr);
		mutex_enter(&snfq->snfq_lock);
	}
}
2215 2239
2216 2240
/*
 * Allocates and queues an asynchronous sendfile request, spawning an
 * additional service thread if the existing ones appear busy (bounded by
 * snfq_max_threads).  Returns the request; the caller consumes the data
 * via snf_deque() and frees the request when done.
 */
snf_req_t *
create_thread(int operation, struct vnode *vp, file_t *fp,
    u_offset_t fileoff, u_offset_t size)
{
	snf_req_t *sr;
	stdata_t *stp;

	sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);

	sr->sr_vp = vp;
	sr->sr_fp = fp;
	stp = vp->v_stream;

	/*
	 * store sd_qn_maxpsz into sr_maxpsz while we have stream head.
	 * stream might be closed before thread returns from snf_async_read.
	 */
	if (stp != NULL && stp->sd_qn_maxpsz > 0) {
		sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
	} else {
		sr->sr_maxpsz = MAXBSIZE;
	}

	sr->sr_operation = operation;
	sr->sr_file_off = fileoff;
	sr->sr_file_size = size;
	sr->sr_hiwat = sendfile_req_hiwat;
	sr->sr_lowat = sendfile_req_lowat;
	mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
	/*
	 * See whether we need another thread for servicing this
	 * request. If there are already enough requests queued
	 * for the threads, create one if not exceeding
	 * snfq_max_threads.
	 */
	mutex_enter(&snfq->snfq_lock);
	if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
	    snfq->snfq_svc_threads < snfq->snfq_max_threads) {
		(void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
		    TS_RUN, minclsyspri);
		snfq->snfq_svc_threads++;
	}
	/*
	 * Signal only on the empty->non-empty transition; an idle service
	 * thread may be waiting in snf_async_thread().
	 */
	if (snfq->snfq_req_head == NULL) {
		snfq->snfq_req_head = snfq->snfq_req_tail = sr;
		cv_signal(&snfq->snfq_cv);
	} else {
		snfq->snfq_req_tail->sr_next = sr;
		snfq->snfq_req_tail = sr;
	}
	snfq->snfq_req_cnt++;
	mutex_exit(&snfq->snfq_lock);
	return (sr);
}
2271 2295
/*
 * Uncached sendfile path: a service thread reads the file
 * (snf_async_read) while this thread dequeues the resulting mblks and
 * hands them to the transport.  See the big comment above for the
 * reader/writer error protocol.  On return *count holds the number of
 * bytes handed to the transport.
 */
int
snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
    ssize_t *count)
{
	snf_req_t *sr;
	mblk_t *mp;
	int iosize;
	int error = 0;
	short fflag;
	struct vnode *vp;
	int ksize;
	struct nmsghdr msg;

	ksize = 0;
	*count = 0;
	bzero(&msg, sizeof (msg));

	vp = fp->f_vnode;
	fflag = fp->f_flag;
	if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
		return (EAGAIN);

	/*
	 * We check for read error in snf_deque. It has to check
	 * for successful READ_DONE and return NULL, and we might
	 * as well make an additional check there.
	 */
	while ((mp = snf_deque(sr)) != NULL) {

		if (ISSIG(curthread, JUSTLOOKING)) {
			freeb(mp);
			error = EINTR;
			break;
		}
		/* Remember the size now; socket_sendmblk() consumes mp. */
		iosize = MBLKL(mp);

		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);

		if (error != 0) {
			if (mp != NULL)
				freeb(mp);
			break;
		}
		ksize += iosize;
	}
	*count = ksize;

	mutex_enter(&sr->sr_lock);
	sr->sr_write_error = error;
	/* Look at the big comments on why we cv_signal here. */
	cv_signal(&sr->sr_cv);

	/* Wait for the reader to complete always. */
	while (!(sr->sr_read_error & SR_READ_DONE)) {
		cv_wait(&sr->sr_cv, &sr->sr_lock);
	}
	/* If there is no write error, check for read error. */
	if (error == 0)
		error = (sr->sr_read_error & ~SR_READ_DONE);

	if (error != 0) {
		mblk_t *next_mp;

		/* Drain and free anything the reader left queued. */
		mp = sr->sr_mp_head;
		while (mp != NULL) {
			next_mp = mp->b_next;
			mp->b_next = NULL;
			freeb(mp);
			mp = next_mp;
		}
	}
	mutex_exit(&sr->sr_lock);
	kmem_free(sr, sizeof (snf_req_t));
	return (error);
}
2347 2371
/* Maximum no.of pages allocated by vpm for sendfile at a time */
#define	SNF_VPMMAXPGS	(VPMMAXPGS/2)

/*
 * Maximum no.of elements in the list returned by vpm, including
 * NULL for the last entry
 */
#define	SNF_MAXVMAPS	(SNF_VPMMAXPGS + 1)

/*
 * Per-chain state for the vpm zero-copy path; torn down by
 * snf_vmap_desbfree() once the last referencing mblk is freed.
 */
typedef struct {
	unsigned int	snfv_ref;	/* mblks still referencing the maps */
	frtn_t		snfv_frtn;	/* esballoca free-routine descriptor */
	vnode_t		*snfv_vp;	/* file vnode, held while mapped */
	struct vmap	snfv_vml[SNF_MAXVMAPS];	/* mappings from vpm_map_pages */
} snf_vmap_desbinfo;

/*
 * Per-mblk state for the segmap zero-copy path; torn down by
 * snf_smap_desbfree() when the mblk is freed.
 */
typedef struct {
	frtn_t		snfi_frtn;	/* esballoca free-routine descriptor */
	caddr_t		snfi_base;	/* segmap base address */
	uint_t		snfi_mapoff;	/* data offset within the mapping */
	size_t		snfi_len;	/* softlocked length, page-rounded */
	vnode_t		*snfi_vp;	/* file vnode, held while mapped */
} snf_smap_desbinfo;
2371 2395
2372 2396 /*
2373 2397 * The callback function used for vpm mapped mblks called when the last ref of
2374 2398 * the mblk is dropped which normally occurs when TCP receives the ack. But it
2375 2399 * can be the driver too due to lazy reclaim.
2376 2400 */
2377 2401 void
2378 2402 snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2379 2403 {
2380 2404 ASSERT(snfv->snfv_ref != 0);
2381 2405 if (atomic_add_32_nv(&snfv->snfv_ref, -1) == 0) {
2382 2406 vpm_unmap_pages(snfv->snfv_vml, S_READ);
2383 2407 VN_RELE(snfv->snfv_vp);
2384 2408 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2385 2409 }
2386 2410 }
2387 2411
/*
 * The callback function used for segmap'ped mblks called when the last ref of
 * the mblk is dropped which normally occurs when TCP receives the ack. But it
 * can be the driver too due to lazy reclaim.
 */
void
snf_smap_desbfree(snf_smap_desbinfo *snfi)
{
	if (! IS_KPM_ADDR(snfi->snfi_base)) {
		/*
		 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
		 * segmap_kpm as long as the latter never falls back to
		 * "use_segmap_range". (See segmap_getmapflt().)
		 *
		 * Using S_OTHER saves an redundant hat_setref() in
		 * segmap_unlock()
		 */
		(void) segmap_fault(kas.a_hat, segkmap,
		    (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
		    snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
		    F_SOFTUNLOCK, S_OTHER);
	}
	(void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
	/* Drop the hold taken in snf_segmap() and free the descriptor. */
	VN_RELE(snfi->snfi_vp);
	kmem_free(snfi, sizeof (*snfi));
}
2414 2438
2415 2439 /*
2416 2440 * Use segmap or vpm instead of bcopy to send down a desballoca'ed, mblk.
2417 2441 * When segmap is used, the mblk contains a segmap slot of no more
2418 2442 * than MAXBSIZE.
2419 2443 *
2420 2444 * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2421 2445 * in each iteration and sent by socket_sendmblk until an error occurs or
2422 2446 * the requested size has been transferred. An mblk is esballoca'ed from
2423 2447 * each mapped page and a chain of these mblk is sent to the transport layer.
2424 2448 * vpm will be called to unmap the pages when all mblks have been freed by
2425 2449 * free_func.
2426 2450 *
2427 2451 * At the end of the whole sendfile() operation, we wait till the data from
2428 2452 * the last mblk is ack'ed by the transport before returning so that the
2429 2453 * caller of sendfile() can safely modify the file content.
2430 2454 */
2431 2455 int
2432 2456 snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2433 2457 ssize_t *count, boolean_t nowait)
2434 2458 {
2435 2459 caddr_t base;
2436 2460 int mapoff;
2437 2461 vnode_t *vp;
2438 2462 mblk_t *mp = NULL;
2439 2463 int chain_size;
2440 2464 int error;
2441 2465 clock_t deadlk_wait;
2442 2466 short fflag;
2443 2467 int ksize;
2444 2468 struct vattr va;
2445 2469 boolean_t dowait = B_FALSE;
2446 2470 struct nmsghdr msg;
2447 2471
2448 2472 vp = fp->f_vnode;
2449 2473 fflag = fp->f_flag;
2450 2474 ksize = 0;
2451 2475 bzero(&msg, sizeof (msg));
2452 2476
2453 2477 for (;;) {
2454 2478 if (ISSIG(curthread, JUSTLOOKING)) {
2455 2479 error = EINTR;
2456 2480 break;
2457 2481 }
2458 2482
2459 2483 if (vpm_enable) {
2460 2484 snf_vmap_desbinfo *snfv;
2461 2485 mblk_t *nmp;
2462 2486 int mblk_size;
2463 2487 int maxsize;
2464 2488 int i;
2465 2489
2466 2490 mapoff = fileoff & PAGEOFFSET;
2467 2491 maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2468 2492
2469 2493 snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2470 2494 KM_SLEEP);
2471 2495
2472 2496 /*
2473 2497 * Get vpm mappings for maxsize with read access.
2474 2498 * If the pages aren't available yet, we get
2475 2499 * DEADLK, so wait and try again a little later using
2476 2500 * an increasing wait. We might be here a long time.
2477 2501 *
2478 2502 * If delay_sig returns EINTR, be sure to exit and
2479 2503 * pass it up to the caller.
2480 2504 */
2481 2505 deadlk_wait = 0;
2482 2506 while ((error = vpm_map_pages(fvp, fileoff,
2483 2507 (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2484 2508 SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2485 2509 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2486 2510 if ((error = delay_sig(deadlk_wait)) != 0) {
2487 2511 break;
2488 2512 }
2489 2513 }
2490 2514 if (error != 0) {
2491 2515 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2492 2516 error = (error == EINTR) ? EINTR : EIO;
2493 2517 goto out;
2494 2518 }
2495 2519 snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2496 2520 snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2497 2521
2498 2522 /* Construct the mblk chain from the page mappings */
2499 2523 chain_size = 0;
2500 2524 for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2501 2525 total_size > 0; i++) {
2502 2526 ASSERT(chain_size < maxsize);
2503 2527 mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2504 2528 mapoff, total_size);
2505 2529 nmp = esballoca(
2506 2530 (uchar_t *)snfv->snfv_vml[i].vs_addr +
2507 2531 mapoff, mblk_size, BPRI_HI,
2508 2532 &snfv->snfv_frtn);
2509 2533
2510 2534 /*
2511 2535 * We return EAGAIN after unmapping the pages
2512 2536 * if we cannot allocate the the head of the
2513 2537 * chain. Otherwise, we continue sending the
2514 2538 * mblks constructed so far.
2515 2539 */
2516 2540 if (nmp == NULL) {
2517 2541 if (i == 0) {
2518 2542 vpm_unmap_pages(snfv->snfv_vml,
2519 2543 S_READ);
2520 2544 kmem_free(snfv,
2521 2545 sizeof (snf_vmap_desbinfo));
2522 2546 error = EAGAIN;
2523 2547 goto out;
2524 2548 }
2525 2549 break;
2526 2550 }
2527 2551 /* Mark this dblk with the zero-copy flag */
2528 2552 nmp->b_datap->db_struioflag |= STRUIO_ZC;
2529 2553 nmp->b_wptr += mblk_size;
2530 2554 chain_size += mblk_size;
2531 2555 fileoff += mblk_size;
2532 2556 total_size -= mblk_size;
2533 2557 snfv->snfv_ref++;
2534 2558 mapoff = 0;
2535 2559 if (i > 0)
2536 2560 linkb(mp, nmp);
2537 2561 else
2538 2562 mp = nmp;
2539 2563 }
2540 2564 VN_HOLD(fvp);
2541 2565 snfv->snfv_vp = fvp;
2542 2566 } else {
2543 2567 /* vpm not supported. fallback to segmap */
2544 2568 snf_smap_desbinfo *snfi;
2545 2569
2546 2570 mapoff = fileoff & MAXBOFFSET;
2547 2571 chain_size = MAXBSIZE - mapoff;
2548 2572 if (chain_size > total_size)
2549 2573 chain_size = total_size;
2550 2574 /*
2551 2575 * we don't forcefault because we'll call
2552 2576 * segmap_fault(F_SOFTLOCK) next.
2553 2577 *
2554 2578 * S_READ will get the ref bit set (by either
2555 2579 * segmap_getmapflt() or segmap_fault()) and page
2556 2580 * shared locked.
2557 2581 */
2558 2582 base = segmap_getmapflt(segkmap, fvp, fileoff,
2559 2583 chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2560 2584
2561 2585 snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2562 2586 snfi->snfi_len = (size_t)roundup(mapoff+chain_size,
2563 2587 PAGESIZE)- (mapoff & PAGEMASK);
2564 2588 /*
2565 2589 * We must call segmap_fault() even for segmap_kpm
2566 2590 * because that's how error gets returned.
2567 2591 * (segmap_getmapflt() never fails but segmap_fault()
2568 2592 * does.)
2569 2593 *
2570 2594 * If the pages aren't available yet, we get
2571 2595 * DEADLK, so wait and try again a little later using
2572 2596 * an increasing wait. We might be here a long time.
2573 2597 *
2574 2598 * If delay_sig returns EINTR, be sure to exit and
2575 2599 * pass it up to the caller.
2576 2600 */
2577 2601 deadlk_wait = 0;
2578 2602 while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2579 2603 segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2580 2604 mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2581 2605 S_READ))) == EDEADLK) {
2582 2606 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2583 2607 if ((error = delay_sig(deadlk_wait)) != 0) {
2584 2608 break;
2585 2609 }
2586 2610 }
2587 2611 if (error != 0) {
2588 2612 (void) segmap_release(segkmap, base, 0);
2589 2613 kmem_free(snfi, sizeof (*snfi));
2590 2614 error = (error == EINTR) ? EINTR : EIO;
2591 2615 goto out;
2592 2616 }
2593 2617 snfi->snfi_frtn.free_func = snf_smap_desbfree;
2594 2618 snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2595 2619 snfi->snfi_base = base;
2596 2620 snfi->snfi_mapoff = mapoff;
2597 2621 mp = esballoca((uchar_t *)base + mapoff, chain_size,
2598 2622 BPRI_HI, &snfi->snfi_frtn);
2599 2623
2600 2624 if (mp == NULL) {
2601 2625 (void) segmap_fault(kas.a_hat, segkmap,
2602 2626 (caddr_t)(uintptr_t)(((uintptr_t)base +
2603 2627 mapoff) & PAGEMASK), snfi->snfi_len,
2604 2628 F_SOFTUNLOCK, S_OTHER);
2605 2629 (void) segmap_release(segkmap, base, 0);
2606 2630 kmem_free(snfi, sizeof (*snfi));
2607 2631 freemsg(mp);
2608 2632 error = EAGAIN;
2609 2633 goto out;
2610 2634 }
2611 2635 VN_HOLD(fvp);
2612 2636 snfi->snfi_vp = fvp;
2613 2637 mp->b_wptr += chain_size;
2614 2638
2615 2639 /* Mark this dblk with the zero-copy flag */
2616 2640 mp->b_datap->db_struioflag |= STRUIO_ZC;
2617 2641 fileoff += chain_size;
2618 2642 total_size -= chain_size;
2619 2643 }
2620 2644
2621 2645 if (total_size == 0 && !nowait) {
2622 2646 ASSERT(!dowait);
2623 2647 dowait = B_TRUE;
2624 2648 mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2625 2649 }
2626 2650 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2627 2651 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2628 2652 if (error != 0) {
2629 2653 /*
2630 2654 * mp contains the mblks that were not sent by
2631 2655 * socket_sendmblk. Use its size to update *count
2632 2656 */
2633 2657 *count = ksize + (chain_size - msgdsize(mp));
2634 2658 if (mp != NULL)
2635 2659 freemsg(mp);
2636 2660 return (error);
2637 2661 }
2638 2662 ksize += chain_size;
2639 2663 if (total_size == 0)
2640 2664 goto done;
2641 2665
2642 2666 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2643 2667 va.va_mask = AT_SIZE;
2644 2668 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2645 2669 if (error)
2646 2670 break;
2647 2671 /* Read as much as possible. */
2648 2672 if (fileoff >= va.va_size)
2649 2673 break;
2650 2674 if (total_size + fileoff > va.va_size)
2651 2675 total_size = va.va_size - fileoff;
2652 2676 }
2653 2677 out:
2654 2678 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2655 2679 done:
2656 2680 *count = ksize;
2657 2681 if (dowait) {
2658 2682 stdata_t *stp;
2659 2683
2660 2684 stp = vp->v_stream;
2661 2685 if (stp == NULL) {
2662 2686 struct sonode *so;
2663 2687 so = VTOSO(vp);
2664 2688 error = so_zcopy_wait(so);
2665 2689 } else {
2666 2690 mutex_enter(&stp->sd_lock);
2667 2691 while (!(stp->sd_flag & STZCNOTIFY)) {
2668 2692 if (cv_wait_sig(&stp->sd_zcopy_wait,
2669 2693 &stp->sd_lock) == 0) {
2670 2694 error = EINTR;
2671 2695 break;
2672 2696 }
2673 2697 }
2674 2698 stp->sd_flag &= ~STZCNOTIFY;
2675 2699 mutex_exit(&stp->sd_lock);
2676 2700 }
2677 2701 }
2678 2702 return (error);
2679 2703 }
2680 2704
/*
 * Cached sendfile path: read the file through the page cache with
 * VOP_READ() into freshly allocated mblks and hand them to the transport
 * with socket_sendmblk().  The caller (sosendfile64) holds fvp's rwlock
 * as a reader on entry; we drop it around each send, re-acquire it before
 * the next read, and always release it before returning.  On return
 * *count holds the number of bytes handed to the transport.
 */
int
snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
    uint_t maxpsz, ssize_t *count)
{
	struct vnode *vp;
	mblk_t *mp;
	int iosize;
	int extra = 0;
	int error;
	short fflag;
	int ksize;
	int ioflag;
	struct uio auio;
	struct iovec aiov;
	struct vattr va;
	int maxblk = 0;
	int wroff = 0;
	struct sonode *so;
	struct nmsghdr msg;

	vp = fp->f_vnode;
	if (vp->v_type == VSOCK) {
		stdata_t *stp;

		/*
		 * Get the extra space to insert a header and a trailer.
		 */
		so = VTOSO(vp);
		stp = vp->v_stream;
		if (stp == NULL) {
			wroff = so->so_proto_props.sopp_wroff;
			maxblk = so->so_proto_props.sopp_maxblk;
			extra = wroff + so->so_proto_props.sopp_tail;
		} else {
			wroff = (int)(stp->sd_wroff);
			maxblk = (int)(stp->sd_maxblk);
			extra = wroff + (int)(stp->sd_tail);
		}
	}
	bzero(&msg, sizeof (msg));
	fflag = fp->f_flag;
	ksize = 0;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_llimit = MAXOFFSET_T;
	auio.uio_fmode = fflag;
	auio.uio_extflg = UIO_COPY_CACHED;
	ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
	/* If read sync is not asked for, filter sync flags */
	if ((ioflag & FRSYNC) == 0)
		ioflag &= ~(FSYNC|FDSYNC);
	for (;;) {
		if (ISSIG(curthread, JUSTLOOKING)) {
			error = EINTR;
			break;
		}
		iosize = (int)MIN(maxpsz, size);

		/*
		 * Socket filters can limit the mblk size,
		 * so limit reads to maxblk if there are
		 * filters present.
		 */
		if (vp->v_type == VSOCK &&
		    so->so_filter_active > 0 && maxblk != INFPSZ)
			iosize = (int)MIN(iosize, maxblk);

		if (is_system_labeled()) {
			mp = allocb_cred(iosize + extra, CRED(),
			    curproc->p_pid);
		} else {
			mp = allocb(iosize + extra, BPRI_MED);
		}
		if (mp == NULL) {
			error = EAGAIN;
			break;
		}

		/* Leave room for the protocol header. */
		mp->b_rptr += wroff;

		aiov.iov_base = (caddr_t)mp->b_rptr;
		aiov.iov_len = iosize;
		auio.uio_loffset = fileoff;
		auio.uio_resid = iosize;

		error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
		/* iosize becomes the number of bytes actually read. */
		iosize -= auio.uio_resid;

		/* A partial read interrupted by a signal still counts. */
		if (error == EINTR && iosize != 0)
			error = 0;

		if (error != 0 || iosize == 0) {
			freeb(mp);
			break;
		}
		mp->b_wptr = mp->b_rptr + iosize;

		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);

		error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);

		if (error != 0) {
			*count = ksize;
			if (mp != NULL)
				freeb(mp);
			return (error);
		}
		ksize += iosize;
		size -= iosize;
		if (size == 0)
			goto done;

		fileoff += iosize;
		(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		/* Re-check the file size; it may have changed while unlocked. */
		va.va_mask = AT_SIZE;
		error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
		if (error)
			break;
		/* Read as much as possible. */
		if (fileoff >= va.va_size)
			size = 0;
		else if (size + fileoff > va.va_size)
			size = va.va_size - fileoff;
	}
	VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
done:
	*count = ksize;
	return (error);
}
2811 2835
2812 2836 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
/*
 * Largefile support for 32 bit applications only.
 *
 * Validates the request, clamps it to the current file size, and
 * dispatches to one of three transfer strategies: snf_direct_io() for
 * files larger than sendfile_max_size, snf_segmap() when zero-copy is
 * possible and worthwhile, and snf_cache() otherwise.  *count32 is set
 * to the number of bytes transferred, even on error.
 */
int
sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
    ssize32_t *count32)
{
	ssize32_t sfv_len;
	u_offset_t sfv_off, va_size;
	struct vnode *vp, *fvp, *realvp;
	struct vattr va;
	stdata_t *stp;
	ssize_t count = 0;
	int error = 0;
	boolean_t dozcopy = B_FALSE;
	uint_t maxpsz;

	sfv_len = (ssize32_t)sfv->sfv_len;
	if (sfv_len < 0) {
		error = EINVAL;
		goto out;
	}

	if (sfv_len == 0) goto out;

	sfv_off = (u_offset_t)sfv->sfv_off;

	/* Same checks as in pread */
	if (sfv_off > MAXOFFSET_T) {
		error = EINVAL;
		goto out;
	}
	if (sfv_off + sfv_len > MAXOFFSET_T)
		sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);

	/*
	 * There are no more checks on sfv_len. So, we cast it to
	 * u_offset_t and share the snf_direct_io/snf_cache code between
	 * 32 bit and 64 bit.
	 *
	 * TODO: should do nbl_need_check() like read()?
	 */
	if (sfv_len > sendfile_max_size) {
		sf_stats.ss_file_not_cached++;
		error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
		    &count);
		goto out;
	}
	fvp = rfp->f_vnode;
	if (VOP_REALVP(fvp, &realvp, NULL) == 0)
		fvp = realvp;
	/*
	 * Grab the lock as a reader to prevent the file size
	 * from changing underneath.
	 */
	(void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
	va.va_mask = AT_SIZE;
	error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
	va_size = va.va_size;
	if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
		VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
		goto out;
	}
	/* Read as much as possible. */
	if (sfv_off + sfv_len > va_size)
		sfv_len = va_size - sfv_off;

	vp = fp->f_vnode;
	stp = vp->v_stream;
	/*
	 * When the NOWAIT flag is not set, we enable zero-copy only if the
	 * transfer size is large enough. This prevents performance loss
	 * when the caller sends the file piece by piece.
	 */
	if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
	    (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
	    !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
		uint_t copyflag;
		copyflag = stp != NULL ? stp->sd_copyflag :
		    VTOSO(vp)->so_proto_props.sopp_zcopyflag;
		/* If the transport hasn't decided yet, ask for copy avoidance. */
		if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
			int on = 1;

			if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
			    SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
				dozcopy = B_TRUE;
		} else {
			dozcopy = copyflag & STZCVMSAFE;
		}
	}
	if (dozcopy) {
		sf_stats.ss_file_segmap++;
		error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
		    &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
	} else {
		if (vp->v_type == VSOCK && stp == NULL) {
			sonode_t *so = VTOSO(vp);
			maxpsz = so->so_proto_props.sopp_maxpsz;
		} else if (stp != NULL) {
			maxpsz = stp->sd_qn_maxpsz;
		} else {
			maxpsz = maxphys;
		}

		if (maxpsz == INFPSZ)
			maxpsz = maxphys;
		else
			maxpsz = roundup(maxpsz, MAXBSIZE);
		sf_stats.ss_file_cached++;
		error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
		    maxpsz, &count);
	}
out:
	releasef(sfv->sfv_fd);
	*count32 = (ssize32_t)count;
	return (error);
}
2930 2954 #endif
2931 2955
2932 2956 #ifdef _SYSCALL32_IMPL
2933 2957 /*
2934 2958 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2935 2959 * ssize_t rather than ssize32_t; see the comments above read32 for details.
2936 2960 */
2937 2961
2938 2962 ssize_t
2939 2963 recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2940 2964 {
2941 2965 return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2942 2966 }
2943 2967
2944 2968 ssize_t
2945 2969 recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2946 2970 caddr32_t name, caddr32_t namelenp)
2947 2971 {
2948 2972 return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2949 2973 (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
2950 2974 }
2951 2975
2952 2976 ssize_t
2953 2977 send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2954 2978 {
2955 2979 return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2956 2980 }
2957 2981
2958 2982 ssize_t
2959 2983 sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2960 2984 caddr32_t name, socklen_t namelen)
2961 2985 {
2962 2986 return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2963 2987 (void *)(uintptr_t)name, namelen));
2964 2988 }
2965 2989 #endif /* _SYSCALL32_IMPL */
2966 2990
2967 2991 /*
2968 2992 * Function wrappers (mostly around the sonode switch) for
2969 2993 * backward compatibility.
2970 2994 */
2971 2995
/*
 * Compatibility wrapper: accept on a sonode with the caller's credentials.
 */
int
soaccept(struct sonode *so, int fflag, struct sonode **nsop)
{
	int error;

	error = socket_accept(so, fflag, CRED(), nsop);
	return (error);
}
2977 3001
2978 3002 int
2979 3003 sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
2980 3004 int backlog, int flags)
2981 3005 {
2982 3006 int error;
2983 3007
2984 3008 error = socket_bind(so, name, namelen, flags, CRED());
2985 3009 if (error == 0 && backlog != 0)
2986 3010 return (socket_listen(so, backlog, CRED()));
2987 3011
2988 3012 return (error);
2989 3013 }
2990 3014
/*
 * Compatibility wrapper: listen on a sonode with the caller's credentials.
 */
int
solisten(struct sonode *so, int backlog)
{
	int error;

	error = socket_listen(so, backlog, CRED());
	return (error);
}
2996 3020
2997 3021 int
2998 3022 soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
2999 3023 int fflag, int flags)
3000 3024 {
3001 3025 return (socket_connect(so, name, namelen, fflag, flags, CRED()));
3002 3026 }
3003 3027
/*
 * Compatibility wrapper: receive a message with the caller's credentials.
 */
int
sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
	int error;

	error = socket_recvmsg(so, msg, uiop, CRED());
	return (error);
}
3009 3033
/*
 * Compatibility wrapper: send a message with the caller's credentials.
 */
int
sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
{
	int error;

	error = socket_sendmsg(so, msg, uiop, CRED());
	return (error);
}
3015 3039
/*
 * Compatibility wrapper: shut down a sonode with the caller's credentials.
 */
int
soshutdown(struct sonode *so, int how)
{
	int error;

	error = socket_shutdown(so, how, CRED());
	return (error);
}
3021 3045
3022 3046 int
3023 3047 sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
3024 3048 socklen_t *optlenp, int flags)
3025 3049 {
3026 3050 return (socket_getsockopt(so, level, option_name, optval, optlenp,
3027 3051 flags, CRED()));
3028 3052 }
3029 3053
3030 3054 int
3031 3055 sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
3032 3056 t_uscalar_t optlen)
3033 3057 {
3034 3058 return (socket_setsockopt(so, level, option_name, optval, optlen,
3035 3059 CRED()));
3036 3060 }
3037 3061
3038 3062 /*
3039 3063 * Because this is backward compatibility interface it only needs to be
3040 3064 * able to handle the creation of TPI sockfs sockets.
3041 3065 */
3042 3066 struct sonode *
3043 3067 socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3044 3068 int *errorp)
3045 3069 {
3046 3070 struct sonode *so;
3047 3071
3048 3072 ASSERT(sp != NULL);
3049 3073
3050 3074 so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3051 3075 version, SOCKET_SLEEP, errorp, CRED());
3052 3076 if (so == NULL) {
3053 3077 SOCKPARAMS_DEC_REF(sp);
3054 3078 } else {
3055 3079 if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3056 3080 /* Cannot fail, only bumps so_count */
3057 3081 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3058 3082 } else {
3059 3083 socket_destroy(so);
3060 3084 so = NULL;
3061 3085 }
3062 3086 }
3063 3087 return (so);
3064 3088 }
↓ open down ↓ |
2551 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX