5880 Increase IOV_MAX to at least 1024
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
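
Summary of the change: sockfs's private MSG_MAXIOVLEN cap of 16 iovec entries is deleted, and recvmsg()/sendmsg() are instead bounded by IOV_MAX from the newly included <sys/limits.h>. Because IOV_MAX is now large (the synopsis calls for at least 1024), the fixed on-stack iovec arrays become a hybrid: a small stack buffer of IOV_MAX_STACK entries with a kmem_alloc() fallback for larger counts, which must then be freed on every exit path. A condensed sketch of the pattern, assembled from the hunks below (the surrounding control flow is elided):

	struct iovec buf[IOV_MAX_STACK], *aiov = buf;
	ssize_t iovsize = 0;

	if (iovcnt <= 0 || iovcnt > IOV_MAX)
		return (set_errno(EMSGSIZE));

	/* Large iovec arrays move to the kernel heap instead of the stack. */
	if (iovcnt > IOV_MAX_STACK) {
		iovsize = iovcnt * sizeof (struct iovec);
		aiov = kmem_alloc(iovsize, KM_SLEEP);
	}

	/* ... copyin(), build the uio, call recvit()/sendit() ... */

	/* Every return path must release the heap copy. */
	if (iovsize != 0)
		kmem_free(aiov, iovsize);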
--- old/usr/src/uts/common/fs/sockfs/socksyscalls.c
+++ new/usr/src/uts/common/fs/sockfs/socksyscalls.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 - */
25 -
26 -/* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */
27 -/*
24 + * Copyright 2015, Joyent, Inc. All rights reserved.
25 + * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
28 26 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
29 27 */
30 28
31 29 #include <sys/types.h>
32 30 #include <sys/t_lock.h>
33 31 #include <sys/param.h>
34 32 #include <sys/systm.h>
35 33 #include <sys/buf.h>
36 34 #include <sys/conf.h>
37 35 #include <sys/cred.h>
38 36 #include <sys/kmem.h>
39 37 #include <sys/sysmacros.h>
40 38 #include <sys/vfs.h>
41 39 #include <sys/vnode.h>
42 40 #include <sys/debug.h>
43 41 #include <sys/errno.h>
44 42 #include <sys/time.h>
45 43 #include <sys/file.h>
46 44 #include <sys/user.h>
47 45 #include <sys/stream.h>
48 46 #include <sys/strsubr.h>
49 47 #include <sys/strsun.h>
50 48 #include <sys/sunddi.h>
51 49 #include <sys/esunddi.h>
52 50 #include <sys/flock.h>
53 51 #include <sys/modctl.h>
54 52 #include <sys/cmn_err.h>
55 53 #include <sys/vmsystm.h>
56 54 #include <sys/policy.h>
55 +#include <sys/limits.h>
57 56
58 57 #include <sys/socket.h>
59 58 #include <sys/socketvar.h>
60 59
61 60 #include <sys/isa_defs.h>
62 61 #include <sys/inttypes.h>
63 62 #include <sys/systm.h>
64 63 #include <sys/cpuvar.h>
65 64 #include <sys/filio.h>
66 65 #include <sys/sendfile.h>
67 66 #include <sys/ddi.h>
68 67 #include <vm/seg.h>
69 68 #include <vm/seg_map.h>
70 69 #include <vm/seg_kpm.h>
71 70
72 71 #include <fs/sockfs/nl7c.h>
73 72 #include <fs/sockfs/sockcommon.h>
74 73 #include <fs/sockfs/sockfilter_impl.h>
75 74 #include <fs/sockfs/socktpi.h>
76 75
77 76 #ifdef SOCK_TEST
78 77 int do_useracc = 1; /* Controlled by setting SO_DEBUG to 4 */
79 78 #else
80 79 #define do_useracc 1
81 80 #endif /* SOCK_TEST */
82 81
83 82 extern int xnet_truncate_print;
84 83
85 84 extern void nl7c_init(void);
86 85 extern int sockfs_defer_nl7c_init;
87 86
88 87 /*
89 - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c"
90 - * as there isn't a formal definition of IOV_MAX ???
91 - */
92 -#define MSG_MAXIOVLEN 16
93 -
94 -/*
95 88 * Kernel component of socket creation.
96 89 *
97 90 * The socket library determines which version number to use.
98 91 * First the library calls this with a NULL devpath. If this fails
99 92 * to find a transport (using solookup) the library will look in /etc/netconfig
100 93 * for the appropriate transport. If one is found it will pass in the
101 94 * devpath for the kernel to use.
102 95 */
103 96 int
104 97 so_socket(int family, int type_w_flags, int protocol, char *devpath,
105 98 int version)
106 99 {
107 100 struct sonode *so;
108 101 vnode_t *vp;
109 102 struct file *fp;
110 103 int fd;
111 104 int error;
112 105 int type;
113 106
114 107 type = type_w_flags & SOCK_TYPE_MASK;
115 108 type_w_flags &= ~SOCK_TYPE_MASK;
116 109 if (type_w_flags & ~(SOCK_CLOEXEC|SOCK_NDELAY|SOCK_NONBLOCK))
117 110 return (set_errno(EINVAL));
118 111
119 112 if (devpath != NULL) {
120 113 char *buf;
121 114 size_t kdevpathlen = 0;
122 115
123 116 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
124 117 if ((error = copyinstr(devpath, buf,
125 118 MAXPATHLEN, &kdevpathlen)) != 0) {
126 119 kmem_free(buf, MAXPATHLEN);
127 120 return (set_errno(error));
128 121 }
129 122 so = socket_create(family, type, protocol, buf, NULL,
130 123 SOCKET_SLEEP, version, CRED(), &error);
131 124 kmem_free(buf, MAXPATHLEN);
132 125 } else {
133 126 so = socket_create(family, type, protocol, NULL, NULL,
134 127 SOCKET_SLEEP, version, CRED(), &error);
135 128 }
136 129 if (so == NULL)
137 130 return (set_errno(error));
138 131
139 132 /* Allocate a file descriptor for the socket */
140 133 vp = SOTOV(so);
141 134 if (error = falloc(vp, FWRITE|FREAD, &fp, &fd)) {
142 135 (void) socket_close(so, 0, CRED());
143 136 socket_destroy(so);
144 137 return (set_errno(error));
145 138 }
146 139
147 140 /*
148 141 * Now fill in the entries that falloc reserved
149 142 */
150 143 if (type_w_flags & SOCK_NDELAY) {
151 144 so->so_state |= SS_NDELAY;
152 145 fp->f_flag |= FNDELAY;
153 146 }
154 147 if (type_w_flags & SOCK_NONBLOCK) {
155 148 so->so_state |= SS_NONBLOCK;
156 149 fp->f_flag |= FNONBLOCK;
157 150 }
158 151 mutex_exit(&fp->f_tlock);
159 152 setf(fd, fp);
160 153 if ((type_w_flags & SOCK_CLOEXEC) != 0) {
161 154 f_setfd(fd, FD_CLOEXEC);
162 155 }
163 156
164 157 return (fd);
165 158 }
166 159
167 160 /*
168 161 * Map from a file descriptor to a socket node.
169 162 * Returns with the file descriptor held i.e. the caller has to
170 163 * use releasef when done with the file descriptor.
171 164 */
172 165 struct sonode *
173 166 getsonode(int sock, int *errorp, file_t **fpp)
174 167 {
175 168 file_t *fp;
176 169 vnode_t *vp;
177 170 struct sonode *so;
178 171
179 172 if ((fp = getf(sock)) == NULL) {
180 173 *errorp = EBADF;
181 174 eprintline(*errorp);
182 175 return (NULL);
183 176 }
184 177 vp = fp->f_vnode;
185 178 /* Check if it is a socket */
186 179 if (vp->v_type != VSOCK) {
187 180 releasef(sock);
188 181 *errorp = ENOTSOCK;
189 182 eprintline(*errorp);
190 183 return (NULL);
191 184 }
192 185 /*
193 186 * Use the stream head to find the real socket vnode.
194 187 * This is needed when namefs sits above sockfs.
195 188 */
196 189 if (vp->v_stream) {
197 190 ASSERT(vp->v_stream->sd_vnode);
198 191 vp = vp->v_stream->sd_vnode;
199 192
200 193 so = VTOSO(vp);
201 194 if (so->so_version == SOV_STREAM) {
202 195 releasef(sock);
203 196 *errorp = ENOTSOCK;
204 197 eprintsoline(so, *errorp);
205 198 return (NULL);
206 199 }
207 200 } else {
208 201 so = VTOSO(vp);
209 202 }
210 203 if (fpp)
211 204 *fpp = fp;
212 205 return (so);
213 206 }
214 207
215 208 /*
216 209 * Allocate and copyin a sockaddr.
217 210 * Ensures NULL termination for AF_UNIX addresses by extending them
218 211 * with one NULL byte if need be. Verifies that the length is not
219 212 * excessive to prevent an application from consuming all of kernel
220 213 * memory. Returns NULL when an error occurred.
221 214 */
222 215 static struct sockaddr *
223 216 copyin_name(struct sonode *so, struct sockaddr *name, socklen_t *namelenp,
224 217 int *errorp)
225 218 {
226 219 char *faddr;
227 220 size_t namelen = (size_t)*namelenp;
228 221
229 222 ASSERT(namelen != 0);
230 223 if (namelen > SO_MAXARGSIZE) {
231 224 *errorp = EINVAL;
232 225 eprintsoline(so, *errorp);
233 226 return (NULL);
234 227 }
235 228
236 229 faddr = (char *)kmem_alloc(namelen, KM_SLEEP);
237 230 if (copyin(name, faddr, namelen)) {
238 231 kmem_free(faddr, namelen);
239 232 *errorp = EFAULT;
240 233 eprintsoline(so, *errorp);
241 234 return (NULL);
242 235 }
243 236
244 237 /*
245 238 * Add space for NULL termination if needed.
246 239 * Do a quick check if the last byte is NUL.
247 240 */
248 241 if (so->so_family == AF_UNIX && faddr[namelen - 1] != '\0') {
249 242 /* Check if there is any NULL termination */
250 243 size_t i;
251 244 int foundnull = 0;
252 245
253 246 for (i = sizeof (name->sa_family); i < namelen; i++) {
254 247 if (faddr[i] == '\0') {
255 248 foundnull = 1;
256 249 break;
257 250 }
258 251 }
259 252 if (!foundnull) {
260 253 /* Add extra byte for NUL padding */
261 254 char *nfaddr;
262 255
263 256 nfaddr = (char *)kmem_alloc(namelen + 1, KM_SLEEP);
264 257 bcopy(faddr, nfaddr, namelen);
265 258 kmem_free(faddr, namelen);
266 259
267 260 /* NUL terminate */
268 261 nfaddr[namelen] = '\0';
269 262 namelen++;
270 263 ASSERT((socklen_t)namelen == namelen);
271 264 *namelenp = (socklen_t)namelen;
272 265 faddr = nfaddr;
273 266 }
274 267 }
275 268 return ((struct sockaddr *)faddr);
276 269 }
277 270
278 271 /*
279 272 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
280 273 */
281 274 static int
282 275 copyout_arg(void *uaddr, socklen_t ulen, void *ulenp, void *kaddr,
283 276 socklen_t klen)
284 277 {
285 278 if (uaddr != NULL) {
286 279 if (ulen > klen)
287 280 ulen = klen;
288 281
289 282 if (ulen != 0) {
290 283 if (copyout(kaddr, uaddr, ulen))
291 284 return (EFAULT);
292 285 }
293 286 } else
294 287 ulen = 0;
295 288
296 289 if (ulenp != NULL) {
297 290 if (copyout(&ulen, ulenp, sizeof (ulen)))
298 291 return (EFAULT);
299 292 }
300 293 return (0);
301 294 }
302 295
303 296 /*
304 297 * Copy from kaddr/klen to uaddr/ulen. Updates ulenp if non-NULL.
305 298 * If klen is greater than ulen it still uses the non-truncated
306 299 * klen to update ulenp.
307 300 */
308 301 static int
309 302 copyout_name(void *uaddr, socklen_t ulen, void *ulenp, void *kaddr,
310 303 socklen_t klen)
311 304 {
312 305 if (uaddr != NULL) {
313 306 if (ulen >= klen)
314 307 ulen = klen;
315 308 else if (ulen != 0 && xnet_truncate_print) {
316 309 printf("sockfs: truncating copyout of address using "
317 310 "XNET semantics for pid = %d. Lengths %d, %d\n",
318 311 curproc->p_pid, klen, ulen);
319 312 }
320 313
321 314 if (ulen != 0) {
322 315 if (copyout(kaddr, uaddr, ulen))
323 316 return (EFAULT);
324 317 } else
325 318 klen = 0;
326 319 } else
327 320 klen = 0;
328 321
329 322 if (ulenp != NULL) {
330 323 if (copyout(&klen, ulenp, sizeof (klen)))
331 324 return (EFAULT);
332 325 }
333 326 return (0);
334 327 }
335 328
336 329 /*
337 330 * The socketpair() code in libsocket creates two sockets (using
338 331 * the /etc/netconfig fallback if needed) before calling this routine
339 332 * to connect the two sockets together.
340 333 *
341 334 * For a SOCK_STREAM socketpair a listener is needed - in that case this
342 335 * routine will create a new file descriptor as part of accepting the
343 336 * connection. The library socketpair() will check if svs[2] has changed
344 337 * in which case it will close the changed fd.
345 338 *
346 339 * Note that this code could use the TPI feature of accepting the connection
347 340 * on the listening endpoint. However, that would require significant changes
348 341 * to soaccept.
349 342 */
350 343 int
351 344 so_socketpair(int sv[2])
352 345 {
353 346 int svs[2];
354 347 struct sonode *so1, *so2;
355 348 int error;
356 349 int orig_flags;
357 350 struct sockaddr_ux *name;
358 351 size_t namelen;
359 352 sotpi_info_t *sti1;
360 353 sotpi_info_t *sti2;
361 354
362 355 dprint(1, ("so_socketpair(%p)\n", (void *)sv));
363 356
364 357 error = useracc(sv, sizeof (svs), B_WRITE);
365 358 if (error && do_useracc)
366 359 return (set_errno(EFAULT));
367 360
368 361 if (copyin(sv, svs, sizeof (svs)))
369 362 return (set_errno(EFAULT));
370 363
371 364 if ((so1 = getsonode(svs[0], &error, NULL)) == NULL)
372 365 return (set_errno(error));
373 366
374 367 if ((so2 = getsonode(svs[1], &error, NULL)) == NULL) {
375 368 releasef(svs[0]);
376 369 return (set_errno(error));
377 370 }
378 371
379 372 if (so1->so_family != AF_UNIX || so2->so_family != AF_UNIX) {
380 373 error = EOPNOTSUPP;
381 374 goto done;
382 375 }
383 376
384 377 sti1 = SOTOTPI(so1);
385 378 sti2 = SOTOTPI(so2);
386 379
387 380 /*
388 381 * The code below makes assumptions about the "sockfs" implementation.
389 382 * So make sure that the correct implementation is really used.
390 383 */
391 384 ASSERT(so1->so_ops == &sotpi_sonodeops);
392 385 ASSERT(so2->so_ops == &sotpi_sonodeops);
393 386
394 387 if (so1->so_type == SOCK_DGRAM) {
395 388 /*
396 389 * Bind both sockets and connect them with each other.
397 390 * Need to allocate name/namelen for soconnect.
398 391 */
399 392 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC, CRED());
400 393 if (error) {
401 394 eprintsoline(so1, error);
402 395 goto done;
403 396 }
404 397 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
405 398 if (error) {
406 399 eprintsoline(so2, error);
407 400 goto done;
408 401 }
409 402 namelen = sizeof (struct sockaddr_ux);
410 403 name = kmem_alloc(namelen, KM_SLEEP);
411 404 name->sou_family = AF_UNIX;
412 405 name->sou_addr = sti2->sti_ux_laddr;
413 406 error = socket_connect(so1,
414 407 (struct sockaddr *)name,
415 408 (socklen_t)namelen,
416 409 0, _SOCONNECT_NOXLATE, CRED());
417 410 if (error) {
418 411 kmem_free(name, namelen);
419 412 eprintsoline(so1, error);
420 413 goto done;
421 414 }
422 415 name->sou_addr = sti1->sti_ux_laddr;
423 416 error = socket_connect(so2,
424 417 (struct sockaddr *)name,
425 418 (socklen_t)namelen,
426 419 0, _SOCONNECT_NOXLATE, CRED());
427 420 kmem_free(name, namelen);
428 421 if (error) {
429 422 eprintsoline(so2, error);
430 423 goto done;
431 424 }
432 425 releasef(svs[0]);
433 426 releasef(svs[1]);
434 427 } else {
435 428 /*
436 429 * Bind both sockets, with so1 being a listener.
437 430 * Connect so2 to so1 - nonblocking to avoid waiting for
438 431 * soaccept to complete.
439 432 * Accept a connection on so1. Pass out the new fd as sv[0].
440 433 * The library will detect the changed fd and close
441 434 * the original one.
442 435 */
443 436 struct sonode *nso;
444 437 struct vnode *nvp;
445 438 struct file *nfp;
446 439 int nfd;
447 440
448 441 /*
449 442 * We could simply call socket_listen() here (which would do the
450 443 * binding automatically) if the code didn't rely on passing
451 444 * _SOBIND_NOXLATE to the TPI implementation of socket_bind().
452 445 */
453 446 error = socket_bind(so1, NULL, 0, _SOBIND_UNSPEC|
454 447 _SOBIND_NOXLATE|_SOBIND_LISTEN|_SOBIND_SOCKETPAIR,
455 448 CRED());
456 449 if (error) {
457 450 eprintsoline(so1, error);
458 451 goto done;
459 452 }
460 453 error = socket_bind(so2, NULL, 0, _SOBIND_UNSPEC, CRED());
461 454 if (error) {
462 455 eprintsoline(so2, error);
463 456 goto done;
464 457 }
465 458
466 459 namelen = sizeof (struct sockaddr_ux);
467 460 name = kmem_alloc(namelen, KM_SLEEP);
468 461 name->sou_family = AF_UNIX;
469 462 name->sou_addr = sti1->sti_ux_laddr;
470 463 error = socket_connect(so2,
471 464 (struct sockaddr *)name,
472 465 (socklen_t)namelen,
473 466 FNONBLOCK, _SOCONNECT_NOXLATE, CRED());
474 467 kmem_free(name, namelen);
475 468 if (error) {
476 469 if (error != EINPROGRESS) {
477 470 eprintsoline(so2, error); goto done;
478 471 }
479 472 }
480 473
481 474 error = socket_accept(so1, 0, CRED(), &nso);
482 475 if (error) {
483 476 eprintsoline(so1, error);
484 477 goto done;
485 478 }
486 479
487 480 /* wait for so2 being SS_CONNECTED ignoring signals */
488 481 mutex_enter(&so2->so_lock);
489 482 error = sowaitconnected(so2, 0, 1);
490 483 mutex_exit(&so2->so_lock);
491 484 if (error != 0) {
492 485 (void) socket_close(nso, 0, CRED());
493 486 socket_destroy(nso);
494 487 eprintsoline(so2, error);
495 488 goto done;
496 489 }
497 490
498 491 nvp = SOTOV(nso);
499 492 if (error = falloc(nvp, FWRITE|FREAD, &nfp, &nfd)) {
500 493 (void) socket_close(nso, 0, CRED());
501 494 socket_destroy(nso);
502 495 eprintsoline(nso, error);
503 496 goto done;
504 497 }
505 498 /*
506 499 * copy over FNONBLOCK and FNDELAY flags should they exist
507 500 */
508 501 if (so1->so_state & SS_NONBLOCK)
509 502 nfp->f_flag |= FNONBLOCK;
510 503 if (so1->so_state & SS_NDELAY)
511 504 nfp->f_flag |= FNDELAY;
512 505
513 506 /*
514 507 * fill in the entries that falloc reserved
515 508 */
516 509 mutex_exit(&nfp->f_tlock);
517 510 setf(nfd, nfp);
518 511
519 512 /*
520 513 * get the original flags before we release
521 514 */
522 515 VERIFY(f_getfd_error(svs[0], &orig_flags) == 0);
523 516
524 517 releasef(svs[0]);
525 518 releasef(svs[1]);
526 519
527 520 /*
528 521 * If FD_CLOEXEC was set on the filedescriptor we're
529 522 * swapping out, we should set it on the new one too.
530 523 */
531 524 if (orig_flags & FD_CLOEXEC) {
532 525 f_setfd(nfd, FD_CLOEXEC);
533 526 }
534 527
535 528 /*
536 529 * The socketpair library routine will close the original
537 530 * svs[0] when this code passes out a different file
538 531 * descriptor.
539 532 */
540 533 svs[0] = nfd;
541 534
542 535 if (copyout(svs, sv, sizeof (svs))) {
543 536 (void) closeandsetf(nfd, NULL);
544 537 eprintline(EFAULT);
545 538 return (set_errno(EFAULT));
546 539 }
547 540 }
548 541 return (0);
549 542
550 543 done:
551 544 releasef(svs[0]);
552 545 releasef(svs[1]);
553 546 return (set_errno(error));
554 547 }
555 548
556 549 int
557 550 bind(int sock, struct sockaddr *name, socklen_t namelen, int version)
558 551 {
559 552 struct sonode *so;
560 553 int error;
561 554
562 555 dprint(1, ("bind(%d, %p, %d)\n",
563 556 sock, (void *)name, namelen));
564 557
565 558 if ((so = getsonode(sock, &error, NULL)) == NULL)
566 559 return (set_errno(error));
567 560
568 561 /* Allocate and copyin name */
569 562 /*
570 563 * X/Open test does not expect EFAULT with NULL name and non-zero
571 564 * namelen.
572 565 */
573 566 if (name != NULL && namelen != 0) {
574 567 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
575 568 name = copyin_name(so, name, &namelen, &error);
576 569 if (name == NULL) {
577 570 releasef(sock);
578 571 return (set_errno(error));
579 572 }
580 573 } else {
581 574 name = NULL;
582 575 namelen = 0;
583 576 }
584 577
585 578 switch (version) {
586 579 default:
587 580 error = socket_bind(so, name, namelen, 0, CRED());
588 581 break;
589 582 case SOV_XPG4_2:
590 583 error = socket_bind(so, name, namelen, _SOBIND_XPG4_2, CRED());
591 584 break;
592 585 case SOV_SOCKBSD:
593 586 error = socket_bind(so, name, namelen, _SOBIND_SOCKBSD, CRED());
594 587 break;
595 588 }
596 589 done:
597 590 releasef(sock);
598 591 if (name != NULL)
599 592 kmem_free(name, (size_t)namelen);
600 593
601 594 if (error)
602 595 return (set_errno(error));
603 596 return (0);
604 597 }
605 598
606 599 /* ARGSUSED2 */
607 600 int
608 601 listen(int sock, int backlog, int version)
609 602 {
610 603 struct sonode *so;
611 604 int error;
612 605
613 606 dprint(1, ("listen(%d, %d)\n",
614 607 sock, backlog));
615 608
616 609 if ((so = getsonode(sock, &error, NULL)) == NULL)
617 610 return (set_errno(error));
618 611
619 612 error = socket_listen(so, backlog, CRED());
620 613
621 614 releasef(sock);
622 615 if (error)
623 616 return (set_errno(error));
624 617 return (0);
625 618 }
626 619
627 620 /*ARGSUSED3*/
628 621 int
629 622 accept(int sock, struct sockaddr *name, socklen_t *namelenp, int version,
630 623 int flags)
631 624 {
632 625 struct sonode *so;
633 626 file_t *fp;
634 627 int error;
635 628 socklen_t namelen;
636 629 struct sonode *nso;
637 630 struct vnode *nvp;
638 631 struct file *nfp;
639 632 int nfd;
640 633 int ssflags;
641 634 struct sockaddr *addrp;
642 635 socklen_t addrlen;
643 636
644 637 dprint(1, ("accept(%d, %p, %p)\n",
645 638 sock, (void *)name, (void *)namelenp));
646 639
647 640 if (flags & ~(SOCK_CLOEXEC|SOCK_NONBLOCK|SOCK_NDELAY)) {
648 641 return (set_errno(EINVAL));
649 642 }
650 643
651 644 /* Translate SOCK_ flags to their SS_ variant */
652 645 ssflags = 0;
653 646 if (flags & SOCK_NONBLOCK)
654 647 ssflags |= SS_NONBLOCK;
655 648 if (flags & SOCK_NDELAY)
656 649 ssflags |= SS_NDELAY;
657 650
658 651 if ((so = getsonode(sock, &error, &fp)) == NULL)
659 652 return (set_errno(error));
660 653
661 654 if (name != NULL) {
662 655 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
663 656 if (copyin(namelenp, &namelen, sizeof (namelen))) {
664 657 releasef(sock);
665 658 return (set_errno(EFAULT));
666 659 }
667 660 if (namelen != 0) {
668 661 error = useracc(name, (size_t)namelen, B_WRITE);
669 662 if (error && do_useracc) {
670 663 releasef(sock);
671 664 return (set_errno(EFAULT));
672 665 }
673 666 } else
674 667 name = NULL;
675 668 } else {
676 669 namelen = 0;
677 670 }
678 671
679 672 /*
680 673 * Allocate the user fd before socket_accept() in order to
681 674 * catch EMFILE errors before calling socket_accept().
682 675 */
683 676 if ((nfd = ufalloc(0)) == -1) {
684 677 eprintsoline(so, EMFILE);
685 678 releasef(sock);
686 679 return (set_errno(EMFILE));
687 680 }
688 681 error = socket_accept(so, fp->f_flag, CRED(), &nso);
689 682 if (error) {
690 683 setf(nfd, NULL);
691 684 releasef(sock);
692 685 return (set_errno(error));
693 686 }
694 687
695 688 nvp = SOTOV(nso);
696 689
697 690 ASSERT(MUTEX_NOT_HELD(&nso->so_lock));
698 691 if (namelen != 0) {
699 692 addrlen = so->so_max_addr_len;
700 693 addrp = (struct sockaddr *)kmem_alloc(addrlen, KM_SLEEP);
701 694
702 695 if ((error = socket_getpeername(nso, (struct sockaddr *)addrp,
703 696 &addrlen, B_TRUE, CRED())) == 0) {
704 697 error = copyout_name(name, namelen, namelenp,
705 698 addrp, addrlen);
706 699 } else {
707 700 ASSERT(error == EINVAL || error == ENOTCONN);
708 701 error = ECONNABORTED;
709 702 }
710 703 kmem_free(addrp, so->so_max_addr_len);
711 704 }
712 705
713 706 if (error) {
714 707 setf(nfd, NULL);
715 708 (void) socket_close(nso, 0, CRED());
716 709 socket_destroy(nso);
717 710 releasef(sock);
718 711 return (set_errno(error));
719 712 }
720 713 if (error = falloc(NULL, FWRITE|FREAD, &nfp, NULL)) {
721 714 setf(nfd, NULL);
722 715 (void) socket_close(nso, 0, CRED());
723 716 socket_destroy(nso);
724 717 eprintsoline(so, error);
725 718 releasef(sock);
726 719 return (set_errno(error));
727 720 }
728 721 /*
729 722 * fill in the entries that falloc reserved
730 723 */
731 724 nfp->f_vnode = nvp;
732 725 mutex_exit(&nfp->f_tlock);
733 726 setf(nfd, nfp);
734 727
735 728 /*
736 729 * Act on SOCK_CLOEXEC from flags
737 730 */
738 731 if (flags & SOCK_CLOEXEC) {
739 732 f_setfd(nfd, FD_CLOEXEC);
740 733 }
741 734
742 735 /*
743 736 * Copy FNDELAY and FNONBLOCK from listener to acceptor
744 737 * and from ssflags
745 738 */
746 739 if ((ssflags | so->so_state) & (SS_NDELAY|SS_NONBLOCK)) {
747 740 uint_t oflag = nfp->f_flag;
748 741 int arg = 0;
749 742
750 743 if ((ssflags | so->so_state) & SS_NONBLOCK)
751 744 arg |= FNONBLOCK;
752 745 else if ((ssflags | so->so_state) & SS_NDELAY)
753 746 arg |= FNDELAY;
754 747
755 748 /*
756 749 * This code is a simplification of the F_SETFL code in fcntl()
757 750 * Ignore any errors from VOP_SETFL.
758 751 */
759 752 if ((error = VOP_SETFL(nvp, oflag, arg, nfp->f_cred, NULL))
760 753 != 0) {
761 754 eprintsoline(so, error);
762 755 error = 0;
763 756 } else {
764 757 mutex_enter(&nfp->f_tlock);
765 758 nfp->f_flag &= ~FMASK | (FREAD|FWRITE);
766 759 nfp->f_flag |= arg;
767 760 mutex_exit(&nfp->f_tlock);
768 761 }
769 762 }
770 763 releasef(sock);
771 764 return (nfd);
772 765 }
773 766
774 767 int
775 768 connect(int sock, struct sockaddr *name, socklen_t namelen, int version)
776 769 {
777 770 struct sonode *so;
778 771 file_t *fp;
779 772 int error;
780 773
781 774 dprint(1, ("connect(%d, %p, %d)\n",
782 775 sock, (void *)name, namelen));
783 776
784 777 if ((so = getsonode(sock, &error, &fp)) == NULL)
785 778 return (set_errno(error));
786 779
787 780 /* Allocate and copyin name */
788 781 if (namelen != 0) {
789 782 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
790 783 name = copyin_name(so, name, &namelen, &error);
791 784 if (name == NULL) {
792 785 releasef(sock);
793 786 return (set_errno(error));
794 787 }
795 788 } else
796 789 name = NULL;
797 790
798 791 error = socket_connect(so, name, namelen, fp->f_flag,
799 792 (version != SOV_XPG4_2) ? 0 : _SOCONNECT_XPG4_2, CRED());
800 793 releasef(sock);
801 794 if (name)
802 795 kmem_free(name, (size_t)namelen);
803 796 if (error)
804 797 return (set_errno(error));
805 798 return (0);
806 799 }
807 800
808 801 /*ARGSUSED2*/
809 802 int
810 803 shutdown(int sock, int how, int version)
811 804 {
812 805 struct sonode *so;
813 806 int error;
814 807
815 808 dprint(1, ("shutdown(%d, %d)\n",
816 809 sock, how));
817 810
818 811 if ((so = getsonode(sock, &error, NULL)) == NULL)
819 812 return (set_errno(error));
820 813
821 814 error = socket_shutdown(so, how, CRED());
822 815
823 816 releasef(sock);
824 817 if (error)
825 818 return (set_errno(error));
826 819 return (0);
827 820 }
828 821
829 822 /*
830 823 * Common receive routine.
831 824 */
832 825 static ssize_t
833 826 recvit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags,
834 827 socklen_t *namelenp, socklen_t *controllenp, int *flagsp)
835 828 {
836 829 struct sonode *so;
837 830 file_t *fp;
838 831 void *name;
839 832 socklen_t namelen;
840 833 void *control;
841 834 socklen_t controllen;
842 835 ssize_t len;
843 836 int error;
844 837
845 838 if ((so = getsonode(sock, &error, &fp)) == NULL)
846 839 return (set_errno(error));
847 840
848 841 len = uiop->uio_resid;
849 842 uiop->uio_fmode = fp->f_flag;
850 843 uiop->uio_extflg = UIO_COPY_CACHED;
851 844
852 845 name = msg->msg_name;
853 846 namelen = msg->msg_namelen;
854 847 control = msg->msg_control;
855 848 controllen = msg->msg_controllen;
856 849
857 850 msg->msg_flags = flags & (MSG_OOB | MSG_PEEK | MSG_WAITALL |
858 851 MSG_DONTWAIT | MSG_XPG4_2);
859 852
860 853 error = socket_recvmsg(so, msg, uiop, CRED());
861 854 if (error) {
862 855 releasef(sock);
863 856 return (set_errno(error));
864 857 }
865 858 lwp_stat_update(LWP_STAT_MSGRCV, 1);
866 859 releasef(sock);
867 860
868 861 error = copyout_name(name, namelen, namelenp,
869 862 msg->msg_name, msg->msg_namelen);
870 863 if (error)
871 864 goto err;
872 865
873 866 if (flagsp != NULL) {
874 867 /*
875 868 * Clear internal flag.
876 869 */
877 870 msg->msg_flags &= ~MSG_XPG4_2;
878 871
879 872 /*
880 873 * Determine MSG_CTRUNC. sorecvmsg sets MSG_CTRUNC only
881 874 * when controllen is zero and there is control data to
882 875 * copy out.
883 876 */
884 877 if (controllen != 0 &&
885 878 (msg->msg_controllen > controllen || control == NULL)) {
886 879 dprint(1, ("recvit: CTRUNC %d %d %p\n",
887 880 msg->msg_controllen, controllen, control));
888 881
889 882 msg->msg_flags |= MSG_CTRUNC;
890 883 }
891 884 if (copyout(&msg->msg_flags, flagsp,
892 885 sizeof (msg->msg_flags))) {
893 886 error = EFAULT;
894 887 goto err;
895 888 }
896 889 }
897 890 /*
898 891 * Note: This MUST be done last. There can be no "goto err" after this
899 892 * point since it could make so_closefds run twice on some part
900 893 * of the file descriptor array.
901 894 */
902 895 if (controllen != 0) {
903 896 if (!(flags & MSG_XPG4_2)) {
904 897 /*
905 898 * Good old msg_accrights can only return a multiple
906 899 * of 4 bytes.
907 900 */
908 901 controllen &= ~((int)sizeof (uint32_t) - 1);
909 902 }
910 903 error = copyout_arg(control, controllen, controllenp,
911 904 msg->msg_control, msg->msg_controllen);
912 905 if (error)
913 906 goto err;
914 907
915 908 if (msg->msg_controllen > controllen || control == NULL) {
916 909 if (control == NULL)
917 910 controllen = 0;
918 911 so_closefds(msg->msg_control, msg->msg_controllen,
919 912 !(flags & MSG_XPG4_2), controllen);
920 913 }
921 914 }
922 915 if (msg->msg_namelen != 0)
923 916 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
924 917 if (msg->msg_controllen != 0)
925 918 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
926 919 return (len - uiop->uio_resid);
927 920
928 921 err:
929 922 /*
930 923 * If we fail and the control part contains file descriptors
931 924 * we have to close the fd's.
932 925 */
933 926 if (msg->msg_controllen != 0)
934 927 so_closefds(msg->msg_control, msg->msg_controllen,
935 928 !(flags & MSG_XPG4_2), 0);
936 929 if (msg->msg_namelen != 0)
937 930 kmem_free(msg->msg_name, (size_t)msg->msg_namelen);
938 931 if (msg->msg_controllen != 0)
939 932 kmem_free(msg->msg_control, (size_t)msg->msg_controllen);
940 933 return (set_errno(error));
941 934 }
942 935
943 936 /*
944 937 * Native system call
945 938 */
946 939 ssize_t
947 940 recv(int sock, void *buffer, size_t len, int flags)
948 941 {
949 942 struct nmsghdr lmsg;
950 943 struct uio auio;
951 944 struct iovec aiov[1];
952 945
953 946 dprint(1, ("recv(%d, %p, %ld, %d)\n",
954 947 sock, buffer, len, flags));
955 948
956 949 if ((ssize_t)len < 0) {
957 950 return (set_errno(EINVAL));
958 951 }
959 952
960 953 aiov[0].iov_base = buffer;
961 954 aiov[0].iov_len = len;
962 955 auio.uio_loffset = 0;
963 956 auio.uio_iov = aiov;
964 957 auio.uio_iovcnt = 1;
965 958 auio.uio_resid = len;
966 959 auio.uio_segflg = UIO_USERSPACE;
967 960 auio.uio_limit = 0;
968 961
969 962 lmsg.msg_namelen = 0;
970 963 lmsg.msg_controllen = 0;
971 964 lmsg.msg_flags = 0;
972 965 return (recvit(sock, &lmsg, &auio, flags, NULL, NULL, NULL));
973 966 }
974 967
975 968 ssize_t
976 969 recvfrom(int sock, void *buffer, size_t len, int flags, struct sockaddr *name,
977 970 socklen_t *namelenp)
978 971 {
979 972 struct nmsghdr lmsg;
980 973 struct uio auio;
981 974 struct iovec aiov[1];
982 975
983 976 dprint(1, ("recvfrom(%d, %p, %ld, %d, %p, %p)\n",
984 977 sock, buffer, len, flags, (void *)name, (void *)namelenp));
985 978
986 979 if ((ssize_t)len < 0) {
987 980 return (set_errno(EINVAL));
988 981 }
989 982
990 983 aiov[0].iov_base = buffer;
991 984 aiov[0].iov_len = len;
992 985 auio.uio_loffset = 0;
993 986 auio.uio_iov = aiov;
994 987 auio.uio_iovcnt = 1;
995 988 auio.uio_resid = len;
996 989 auio.uio_segflg = UIO_USERSPACE;
997 990 auio.uio_limit = 0;
998 991
999 992 lmsg.msg_name = (char *)name;
1000 993 if (namelenp != NULL) {
1001 994 if (copyin(namelenp, &lmsg.msg_namelen,
1002 995 sizeof (lmsg.msg_namelen)))
1003 996 return (set_errno(EFAULT));
1004 997 } else {
1005 998 lmsg.msg_namelen = 0;
1006 999 }
1007 1000 lmsg.msg_controllen = 0;
1008 1001 lmsg.msg_flags = 0;
1009 1002
1010 1003 return (recvit(sock, &lmsg, &auio, flags, namelenp, NULL, NULL));
1011 1004 }
1012 1005
1013 1006 /*
1014 1007 * Uses the MSG_XPG4_2 flag to determine if the caller is using
1015 1008 * struct omsghdr or struct nmsghdr.
1016 1009 */
1017 1010 ssize_t
1018 1011 recvmsg(int sock, struct nmsghdr *msg, int flags)
1019 1012 {
1020 1013 STRUCT_DECL(nmsghdr, u_lmsg);
1021 1014 STRUCT_HANDLE(nmsghdr, umsgptr);
1022 1015 struct nmsghdr lmsg;
1023 1016 struct uio auio;
1024 - struct iovec aiov[MSG_MAXIOVLEN];
1017 + struct iovec buf[IOV_MAX_STACK], *aiov = buf;
1018 + ssize_t iovsize = 0;
1025 1019 int iovcnt;
1026 - ssize_t len;
1020 + ssize_t len, rval;
1027 1021 int i;
1028 1022 int *flagsp;
1029 1023 model_t model;
1030 1024
1031 1025 dprint(1, ("recvmsg(%d, %p, %d)\n",
1032 1026 sock, (void *)msg, flags));
1033 1027
1034 1028 model = get_udatamodel();
1035 1029 STRUCT_INIT(u_lmsg, model);
1036 1030 STRUCT_SET_HANDLE(umsgptr, model, msg);
1037 1031
1038 1032 if (flags & MSG_XPG4_2) {
1039 1033 if (copyin(msg, STRUCT_BUF(u_lmsg), STRUCT_SIZE(u_lmsg)))
1040 1034 return (set_errno(EFAULT));
1041 1035 flagsp = STRUCT_FADDR(umsgptr, msg_flags);
1042 1036 } else {
1043 1037 /*
1044 1038 * Assumes that nmsghdr and omsghdr are identically shaped
1045 1039 * except for the added msg_flags field.
1046 1040 */
1047 1041 if (copyin(msg, STRUCT_BUF(u_lmsg),
1048 1042 SIZEOF_STRUCT(omsghdr, model)))
1049 1043 return (set_errno(EFAULT));
1050 1044 STRUCT_FSET(u_lmsg, msg_flags, 0);
1051 1045 flagsp = NULL;
1052 1046 }
1053 1047
1054 1048 /*
1055 1049 * Code below us will kmem_alloc memory and hang it
1056 1050 * off msg_control and msg_name fields. This forces
1057 1051 * us to copy the structure to its native form.
1058 1052 */
1059 1053 lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1060 1054 lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1061 1055 lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1062 1056 lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1063 1057 lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1064 1058 lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1065 1059 lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1066 1060
1067 1061 iovcnt = lmsg.msg_iovlen;
1068 1062
1069 - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1063 + if (iovcnt <= 0 || iovcnt > IOV_MAX) {
1070 1064 return (set_errno(EMSGSIZE));
1071 1065 }
1072 1066
1067 + if (iovcnt > IOV_MAX_STACK) {
1068 + iovsize = iovcnt * sizeof (struct iovec);
1069 + aiov = kmem_alloc(iovsize, KM_SLEEP);
1070 + }
1071 +
1073 1072 #ifdef _SYSCALL32_IMPL
1074 1073 /*
1075 1074 * 32-bit callers need to have their iovec expanded, while ensuring
1076 1075 * that they can't move more than 2Gbytes of data in a single call.
1077 1076 */
1078 1077 if (model == DATAMODEL_ILP32) {
1079 - struct iovec32 aiov32[MSG_MAXIOVLEN];
1078 + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1079 + ssize_t iov32size;
1080 1080 ssize32_t count32;
1081 1081
1082 - if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1083 - iovcnt * sizeof (struct iovec32)))
1082 + iov32size = iovcnt * sizeof (struct iovec32);
1083 + if (iovsize != 0)
1084 + aiov32 = kmem_alloc(iov32size, KM_SLEEP);
1085 +
1086 + if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
1087 + if (iovsize != 0) {
1088 + kmem_free(aiov32, iov32size);
1089 + kmem_free(aiov, iovsize);
1090 + }
1091 +
1084 1092 return (set_errno(EFAULT));
1093 + }
1085 1094
1086 1095 count32 = 0;
1087 1096 for (i = 0; i < iovcnt; i++) {
1088 1097 ssize32_t iovlen32;
1089 1098
1090 1099 iovlen32 = aiov32[i].iov_len;
1091 1100 count32 += iovlen32;
1092 - if (iovlen32 < 0 || count32 < 0)
1101 + if (iovlen32 < 0 || count32 < 0) {
1102 + if (iovsize != 0) {
1103 + kmem_free(aiov32, iov32size);
1104 + kmem_free(aiov, iovsize);
1105 + }
1106 +
1093 1107 return (set_errno(EINVAL));
1108 + }
1109 +
1094 1110 aiov[i].iov_len = iovlen32;
1095 1111 aiov[i].iov_base =
1096 1112 (caddr_t)(uintptr_t)aiov32[i].iov_base;
1097 1113 }
1114 +
1115 + if (iovsize != 0)
1116 + kmem_free(aiov32, iov32size);
1098 1117 } else
1099 1118 #endif /* _SYSCALL32_IMPL */
1100 1119 if (copyin(lmsg.msg_iov, aiov, iovcnt * sizeof (struct iovec))) {
1120 + if (iovsize != 0)
1121 + kmem_free(aiov, iovsize);
1122 +
1101 1123 return (set_errno(EFAULT));
1102 1124 }
1103 1125 len = 0;
1104 1126 for (i = 0; i < iovcnt; i++) {
1105 1127 ssize_t iovlen = aiov[i].iov_len;
1106 1128 len += iovlen;
1107 1129 if (iovlen < 0 || len < 0) {
1130 + if (iovsize != 0)
1131 + kmem_free(aiov, iovsize);
1132 +
1108 1133 return (set_errno(EINVAL));
1109 1134 }
1110 1135 }
1111 1136 auio.uio_loffset = 0;
1112 1137 auio.uio_iov = aiov;
1113 1138 auio.uio_iovcnt = iovcnt;
1114 1139 auio.uio_resid = len;
1115 1140 auio.uio_segflg = UIO_USERSPACE;
1116 1141 auio.uio_limit = 0;
1117 1142
1118 1143 if (lmsg.msg_control != NULL &&
1119 1144 (do_useracc == 0 ||
1120 1145 useracc(lmsg.msg_control, lmsg.msg_controllen,
1121 1146 B_WRITE) != 0)) {
1147 + if (iovsize != 0)
1148 + kmem_free(aiov, iovsize);
1149 +
1122 1150 return (set_errno(EFAULT));
1123 1151 }
1124 1152
1125 - return (recvit(sock, &lmsg, &auio, flags,
1153 + rval = recvit(sock, &lmsg, &auio, flags,
1126 1154 STRUCT_FADDR(umsgptr, msg_namelen),
1127 - STRUCT_FADDR(umsgptr, msg_controllen), flagsp));
1155 + STRUCT_FADDR(umsgptr, msg_controllen), flagsp);
1156 +
1157 + if (iovsize != 0)
1158 + kmem_free(aiov, iovsize);
1159 +
1160 + return (rval);
1128 1161 }
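
Note the shape of the rewritten exit paths above: because aiov may now live on the kernel heap, recvmsg() can no longer tail-call recvit(); the result is parked in rval so the array can be freed exactly once before returning. The 32-bit branch applies the same discipline to its temporary iovec32 staging copy, freeing both buffers before any error return. A condensed sketch of that error path (mirroring the hunk above, not a complete function):

	if (copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
		if (iovsize != 0) {
			/* Free the 32-bit staging copy and the expanded array. */
			kmem_free(aiov32, iov32size);
			kmem_free(aiov, iovsize);
		}
		return (set_errno(EFAULT));
	}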
1129 1162
1130 1163 /*
1131 1164 * Common send function.
1132 1165 */
1133 1166 static ssize_t
1134 1167 sendit(int sock, struct nmsghdr *msg, struct uio *uiop, int flags)
1135 1168 {
1136 1169 struct sonode *so;
1137 1170 file_t *fp;
1138 1171 void *name;
1139 1172 socklen_t namelen;
1140 1173 void *control;
1141 1174 socklen_t controllen;
1142 1175 ssize_t len;
1143 1176 int error;
1144 1177
1145 1178 if ((so = getsonode(sock, &error, &fp)) == NULL)
1146 1179 return (set_errno(error));
1147 1180
1148 1181 uiop->uio_fmode = fp->f_flag;
1149 1182
1150 1183 if (so->so_family == AF_UNIX)
1151 1184 uiop->uio_extflg = UIO_COPY_CACHED;
1152 1185 else
1153 1186 uiop->uio_extflg = UIO_COPY_DEFAULT;
1154 1187
1155 1188 /* Allocate and copyin name and control */
1156 1189 name = msg->msg_name;
1157 1190 namelen = msg->msg_namelen;
1158 1191 if (name != NULL && namelen != 0) {
1159 1192 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1160 1193 name = copyin_name(so,
1161 1194 (struct sockaddr *)name,
1162 1195 &namelen, &error);
1163 1196 if (name == NULL)
1164 1197 goto done3;
1165 1198 /* copyin_name null terminates addresses for AF_UNIX */
1166 1199 msg->msg_namelen = namelen;
1167 1200 msg->msg_name = name;
1168 1201 } else {
1169 1202 msg->msg_name = name = NULL;
1170 1203 msg->msg_namelen = namelen = 0;
1171 1204 }
1172 1205
1173 1206 control = msg->msg_control;
1174 1207 controllen = msg->msg_controllen;
1175 1208 if ((control != NULL) && (controllen != 0)) {
1176 1209 /*
1177 1210 * Verify that the length is not excessive to prevent
1178 1211 * an application from consuming all of kernel memory.
1179 1212 */
1180 1213 if (controllen > SO_MAXARGSIZE) {
1181 1214 error = EINVAL;
1182 1215 goto done2;
1183 1216 }
1184 1217 control = kmem_alloc(controllen, KM_SLEEP);
1185 1218
1186 1219 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1187 1220 if (copyin(msg->msg_control, control, controllen)) {
1188 1221 error = EFAULT;
1189 1222 goto done1;
1190 1223 }
1191 1224 msg->msg_control = control;
1192 1225 } else {
1193 1226 msg->msg_control = control = NULL;
1194 1227 msg->msg_controllen = controllen = 0;
1195 1228 }
1196 1229
1197 1230 len = uiop->uio_resid;
1198 1231 msg->msg_flags = flags;
1199 1232
1200 1233 error = socket_sendmsg(so, msg, uiop, CRED());
1201 1234 done1:
1202 1235 if (control != NULL)
1203 1236 kmem_free(control, controllen);
1204 1237 done2:
1205 1238 if (name != NULL)
1206 1239 kmem_free(name, namelen);
1207 1240 done3:
1208 1241 if (error != 0) {
1209 1242 releasef(sock);
1210 1243 return (set_errno(error));
1211 1244 }
1212 1245 lwp_stat_update(LWP_STAT_MSGSND, 1);
1213 1246 releasef(sock);
1214 1247 return (len - uiop->uio_resid);
1215 1248 }
1216 1249
1217 1250 /*
1218 1251 * Native system call
1219 1252 */
1220 1253 ssize_t
1221 1254 send(int sock, void *buffer, size_t len, int flags)
1222 1255 {
1223 1256 struct nmsghdr lmsg;
1224 1257 struct uio auio;
1225 1258 struct iovec aiov[1];
1226 1259
1227 1260 dprint(1, ("send(%d, %p, %ld, %d)\n",
1228 1261 sock, buffer, len, flags));
1229 1262
1230 1263 if ((ssize_t)len < 0) {
1231 1264 return (set_errno(EINVAL));
1232 1265 }
1233 1266
1234 1267 aiov[0].iov_base = buffer;
1235 1268 aiov[0].iov_len = len;
1236 1269 auio.uio_loffset = 0;
1237 1270 auio.uio_iov = aiov;
1238 1271 auio.uio_iovcnt = 1;
1239 1272 auio.uio_resid = len;
1240 1273 auio.uio_segflg = UIO_USERSPACE;
1241 1274 auio.uio_limit = 0;
1242 1275
1243 1276 lmsg.msg_name = NULL;
1244 1277 lmsg.msg_control = NULL;
1245 1278 if (!(flags & MSG_XPG4_2)) {
1246 1279 /*
1247 1280 * In order to be compatible with the libsocket/sockmod
1248 1281 * implementation we set EOR for all send* calls.
1249 1282 */
1250 1283 flags |= MSG_EOR;
1251 1284 }
1252 1285 return (sendit(sock, &lmsg, &auio, flags));
1253 1286 }
1254 1287
1255 1288 /*
1256 1289 * Uses the MSG_XPG4_2 flag to determine if the caller is using
1257 1290 * struct omsghdr or struct nmsghdr.
1258 1291 */
1259 1292 ssize_t
1260 1293 sendmsg(int sock, struct nmsghdr *msg, int flags)
1261 1294 {
1262 1295 struct nmsghdr lmsg;
1263 1296 STRUCT_DECL(nmsghdr, u_lmsg);
1264 1297 struct uio auio;
1265 - struct iovec aiov[MSG_MAXIOVLEN];
1298 + struct iovec buf[IOV_MAX_STACK], *aiov = buf;
1299 + ssize_t iovsize = 0;
1266 1300 int iovcnt;
1267 - ssize_t len;
1301 + ssize_t len, rval;
1268 1302 int i;
1269 1303 model_t model;
1270 1304
1271 1305 dprint(1, ("sendmsg(%d, %p, %d)\n", sock, (void *)msg, flags));
1272 1306
1273 1307 model = get_udatamodel();
1274 1308 STRUCT_INIT(u_lmsg, model);
1275 1309
1276 1310 if (flags & MSG_XPG4_2) {
1277 1311 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1278 1312 STRUCT_SIZE(u_lmsg)))
1279 1313 return (set_errno(EFAULT));
1280 1314 } else {
1281 1315 /*
1282 1316 * Assumes that nmsghdr and omsghdr are identically shaped
1283 1317 * except for the added msg_flags field.
1284 1318 */
1285 1319 if (copyin(msg, (char *)STRUCT_BUF(u_lmsg),
1286 1320 SIZEOF_STRUCT(omsghdr, model)))
1287 1321 return (set_errno(EFAULT));
1288 1322 /*
1289 1323 * In order to be compatible with the libsocket/sockmod
1290 1324 * implementation we set EOR for all send* calls.
1291 1325 */
1292 1326 flags |= MSG_EOR;
1293 1327 }
1294 1328
1295 1329 /*
1296 1330 * Code below us will kmem_alloc memory and hang it
1297 1331 * off msg_control and msg_name fields. This forces
1298 1332 * us to copy the structure to its native form.
1299 1333 */
1300 1334 lmsg.msg_name = STRUCT_FGETP(u_lmsg, msg_name);
1301 1335 lmsg.msg_namelen = STRUCT_FGET(u_lmsg, msg_namelen);
1302 1336 lmsg.msg_iov = STRUCT_FGETP(u_lmsg, msg_iov);
1303 1337 lmsg.msg_iovlen = STRUCT_FGET(u_lmsg, msg_iovlen);
1304 1338 lmsg.msg_control = STRUCT_FGETP(u_lmsg, msg_control);
1305 1339 lmsg.msg_controllen = STRUCT_FGET(u_lmsg, msg_controllen);
1306 1340 lmsg.msg_flags = STRUCT_FGET(u_lmsg, msg_flags);
1307 1341
1308 1342 iovcnt = lmsg.msg_iovlen;
1309 1343
1310 - if (iovcnt <= 0 || iovcnt > MSG_MAXIOVLEN) {
1344 + if (iovcnt <= 0 || iovcnt > IOV_MAX) {
1311 1345 /*
1312 1346 * Unless this is XPG 4.2 we allow iovcnt == 0 to
1313 1347 * be compatible with SunOS 4.X and 4.4BSD.
1314 1348 */
1315 1349 if (iovcnt != 0 || (flags & MSG_XPG4_2))
1316 1350 return (set_errno(EMSGSIZE));
1317 1351 }
1318 1352
1353 + if (iovcnt > IOV_MAX_STACK) {
1354 + iovsize = iovcnt * sizeof (struct iovec);
1355 + aiov = kmem_alloc(iovsize, KM_SLEEP);
1356 + }
1357 +
1319 1358 #ifdef _SYSCALL32_IMPL
1320 1359 /*
1321 1360 * 32-bit callers need to have their iovec expanded, while ensuring
1322 1361 * that they can't move more than 2Gbytes of data in a single call.
1323 1362 */
1324 1363 if (model == DATAMODEL_ILP32) {
1325 - struct iovec32 aiov32[MSG_MAXIOVLEN];
1364 + struct iovec32 buf32[IOV_MAX_STACK], *aiov32 = buf32;
1365 + ssize_t iov32size;
1326 1366 ssize32_t count32;
1327 1367
1368 + iov32size = iovcnt * sizeof (struct iovec32);
1369 + if (iovsize != 0)
1370 + aiov32 = kmem_alloc(iov32size, KM_SLEEP);
1371 +
1328 1372 if (iovcnt != 0 &&
1329 - copyin((struct iovec32 *)lmsg.msg_iov, aiov32,
1330 - iovcnt * sizeof (struct iovec32)))
1373 + copyin((struct iovec32 *)lmsg.msg_iov, aiov32, iov32size)) {
1374 + if (iovsize != 0) {
1375 + kmem_free(aiov32, iov32size);
1376 + kmem_free(aiov, iovsize);
1377 + }
1378 +
1331 1379 return (set_errno(EFAULT));
1380 + }
1332 1381
1333 1382 count32 = 0;
1334 1383 for (i = 0; i < iovcnt; i++) {
1335 1384 ssize32_t iovlen32;
1336 1385
1337 1386 iovlen32 = aiov32[i].iov_len;
1338 1387 count32 += iovlen32;
1339 - if (iovlen32 < 0 || count32 < 0)
1388 + if (iovlen32 < 0 || count32 < 0) {
1389 + if (iovsize != 0) {
1390 + kmem_free(aiov32, iov32size);
1391 + kmem_free(aiov, iovsize);
1392 + }
1393 +
1340 1394 return (set_errno(EINVAL));
1395 + }
1396 +
1341 1397 aiov[i].iov_len = iovlen32;
1342 1398 aiov[i].iov_base =
1343 1399 (caddr_t)(uintptr_t)aiov32[i].iov_base;
1344 1400 }
1401 +
1402 + if (iovsize != 0)
1403 + kmem_free(aiov32, iov32size);
1345 1404 } else
1346 1405 #endif /* _SYSCALL32_IMPL */
1347 1406 if (iovcnt != 0 &&
1348 1407 copyin(lmsg.msg_iov, aiov,
1349 1408 (unsigned)iovcnt * sizeof (struct iovec))) {
1409 + if (iovsize != 0)
1410 + kmem_free(aiov, iovsize);
1411 +
1350 1412 return (set_errno(EFAULT));
1351 1413 }
1352 1414 len = 0;
1353 1415 for (i = 0; i < iovcnt; i++) {
1354 1416 ssize_t iovlen = aiov[i].iov_len;
1355 1417 len += iovlen;
1356 1418 if (iovlen < 0 || len < 0) {
1419 + if (iovsize != 0)
1420 + kmem_free(aiov, iovsize);
1421 +
1357 1422 return (set_errno(EINVAL));
1358 1423 }
1359 1424 }
1360 1425 auio.uio_loffset = 0;
1361 1426 auio.uio_iov = aiov;
1362 1427 auio.uio_iovcnt = iovcnt;
1363 1428 auio.uio_resid = len;
1364 1429 auio.uio_segflg = UIO_USERSPACE;
1365 1430 auio.uio_limit = 0;
1366 1431
1367 - return (sendit(sock, &lmsg, &auio, flags));
1432 + rval = sendit(sock, &lmsg, &auio, flags);
1433 +
1434 + if (iovsize != 0)
1435 + kmem_free(aiov, iovsize);
1436 +
1437 + return (rval);
1368 1438 }
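
For reference, the effect of the new bound is visible from userland with an ordinary sendmsg() call. A hypothetical smoke test (not part of this change) that sends one byte per iovec over an AF_UNIX datagram socketpair: under the old MSG_MAXIOVLEN any msg_iovlen above 16 failed with EMSGSIZE, while after this change counts up to IOV_MAX are accepted.

	#include <sys/socket.h>
	#include <sys/uio.h>
	#include <stdio.h>
	#include <string.h>

	#define	NIOV	64	/* well above the old limit of 16 */

	int
	main(void)
	{
		int fds[2];
		char byte = 'x';
		struct iovec iov[NIOV];
		struct msghdr msg;
		int i;

		if (socketpair(AF_UNIX, SOCK_DGRAM, 0, fds) != 0) {
			perror("socketpair");
			return (1);
		}
		for (i = 0; i < NIOV; i++) {
			iov[i].iov_base = &byte;
			iov[i].iov_len = 1;
		}
		(void) memset(&msg, 0, sizeof (msg));
		msg.msg_iov = iov;
		msg.msg_iovlen = NIOV;

		/* Before this fix, sendmsg() failed here with EMSGSIZE. */
		if (sendmsg(fds[0], &msg, 0) < 0) {
			perror("sendmsg");
			return (1);
		}
		(void) printf("sent a datagram built from %d iovecs\n", NIOV);
		return (0);
	}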
1369 1439
1370 1440 ssize_t
1371 1441 sendto(int sock, void *buffer, size_t len, int flags,
1372 1442 struct sockaddr *name, socklen_t namelen)
1373 1443 {
1374 1444 struct nmsghdr lmsg;
1375 1445 struct uio auio;
1376 1446 struct iovec aiov[1];
1377 1447
1378 1448 dprint(1, ("sendto(%d, %p, %ld, %d, %p, %d)\n",
1379 1449 sock, buffer, len, flags, (void *)name, namelen));
1380 1450
1381 1451 if ((ssize_t)len < 0) {
1382 1452 return (set_errno(EINVAL));
1383 1453 }
1384 1454
1385 1455 aiov[0].iov_base = buffer;
1386 1456 aiov[0].iov_len = len;
1387 1457 auio.uio_loffset = 0;
1388 1458 auio.uio_iov = aiov;
1389 1459 auio.uio_iovcnt = 1;
1390 1460 auio.uio_resid = len;
1391 1461 auio.uio_segflg = UIO_USERSPACE;
1392 1462 auio.uio_limit = 0;
1393 1463
1394 1464 lmsg.msg_name = (char *)name;
1395 1465 lmsg.msg_namelen = namelen;
1396 1466 lmsg.msg_control = NULL;
1397 1467 if (!(flags & MSG_XPG4_2)) {
1398 1468 /*
1399 1469 * In order to be compatible with the libsocket/sockmod
1400 1470 * implementation we set EOR for all send* calls.
1401 1471 */
1402 1472 flags |= MSG_EOR;
1403 1473 }
1404 1474 return (sendit(sock, &lmsg, &auio, flags));
1405 1475 }
1406 1476
1407 1477 /*ARGSUSED3*/
1408 1478 int
1409 1479 getpeername(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1410 1480 {
1411 1481 struct sonode *so;
1412 1482 int error;
1413 1483 socklen_t namelen;
1414 1484 socklen_t sock_addrlen;
1415 1485 struct sockaddr *sock_addrp;
1416 1486
1417 1487 dprint(1, ("getpeername(%d, %p, %p)\n",
1418 1488 sock, (void *)name, (void *)namelenp));
1419 1489
1420 1490 if ((so = getsonode(sock, &error, NULL)) == NULL)
1421 1491 goto bad;
1422 1492
1423 1493 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1424 1494 if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1425 1495 (name == NULL && namelen != 0)) {
1426 1496 error = EFAULT;
1427 1497 goto rel_out;
1428 1498 }
1429 1499 sock_addrlen = so->so_max_addr_len;
1430 1500 sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1431 1501
1432 1502 if ((error = socket_getpeername(so, sock_addrp, &sock_addrlen,
1433 1503 B_FALSE, CRED())) == 0) {
1434 1504 ASSERT(sock_addrlen <= so->so_max_addr_len);
1435 1505 error = copyout_name(name, namelen, namelenp,
1436 1506 (void *)sock_addrp, sock_addrlen);
1437 1507 }
1438 1508 kmem_free(sock_addrp, so->so_max_addr_len);
1439 1509 rel_out:
1440 1510 releasef(sock);
1441 1511 bad: return (error != 0 ? set_errno(error) : 0);
1442 1512 }
1443 1513
1444 1514 /*ARGSUSED3*/
1445 1515 int
1446 1516 getsockname(int sock, struct sockaddr *name, socklen_t *namelenp, int version)
1447 1517 {
1448 1518 struct sonode *so;
1449 1519 int error;
1450 1520 socklen_t namelen, sock_addrlen;
1451 1521 struct sockaddr *sock_addrp;
1452 1522
1453 1523 dprint(1, ("getsockname(%d, %p, %p)\n",
1454 1524 sock, (void *)name, (void *)namelenp));
1455 1525
1456 1526 if ((so = getsonode(sock, &error, NULL)) == NULL)
1457 1527 goto bad;
1458 1528
1459 1529 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1460 1530 if (copyin(namelenp, &namelen, sizeof (namelen)) ||
1461 1531 (name == NULL && namelen != 0)) {
1462 1532 error = EFAULT;
1463 1533 goto rel_out;
1464 1534 }
1465 1535
1466 1536 sock_addrlen = so->so_max_addr_len;
1467 1537 sock_addrp = (struct sockaddr *)kmem_alloc(sock_addrlen, KM_SLEEP);
1468 1538 if ((error = socket_getsockname(so, sock_addrp, &sock_addrlen,
1469 1539 CRED())) == 0) {
1470 1540 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1471 1541 ASSERT(sock_addrlen <= so->so_max_addr_len);
1472 1542 error = copyout_name(name, namelen, namelenp,
1473 1543 (void *)sock_addrp, sock_addrlen);
1474 1544 }
1475 1545 kmem_free(sock_addrp, so->so_max_addr_len);
1476 1546 rel_out:
1477 1547 releasef(sock);
1478 1548 bad: return (error != 0 ? set_errno(error) : 0);
1479 1549 }
1480 1550
1481 1551 /*ARGSUSED5*/
1482 1552 int
1483 1553 getsockopt(int sock, int level, int option_name, void *option_value,
1484 1554 socklen_t *option_lenp, int version)
1485 1555 {
1486 1556 struct sonode *so;
1487 1557 socklen_t optlen, optlen_res;
1488 1558 void *optval;
1489 1559 int error;
1490 1560
1491 1561 dprint(1, ("getsockopt(%d, %d, %d, %p, %p)\n",
1492 1562 sock, level, option_name, option_value, (void *)option_lenp));
1493 1563
1494 1564 if ((so = getsonode(sock, &error, NULL)) == NULL)
1495 1565 return (set_errno(error));
1496 1566
1497 1567 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1498 1568 if (copyin(option_lenp, &optlen, sizeof (optlen))) {
1499 1569 releasef(sock);
1500 1570 return (set_errno(EFAULT));
1501 1571 }
1502 1572 /*
1503 1573 * Verify that the length is not excessive to prevent
1504 1574 * an application from consuming all of kernel memory.
1505 1575 */
1506 1576 if (optlen > SO_MAXARGSIZE) {
1507 1577 error = EINVAL;
1508 1578 releasef(sock);
1509 1579 return (set_errno(error));
1510 1580 }
1511 1581 optval = kmem_alloc(optlen, KM_SLEEP);
1512 1582 optlen_res = optlen;
1513 1583 error = socket_getsockopt(so, level, option_name, optval,
1514 1584 &optlen_res, (version != SOV_XPG4_2) ? 0 : _SOGETSOCKOPT_XPG4_2,
1515 1585 CRED());
1516 1586 releasef(sock);
1517 1587 if (error) {
1518 1588 kmem_free(optval, optlen);
1519 1589 return (set_errno(error));
1520 1590 }
1521 1591 error = copyout_arg(option_value, optlen, option_lenp,
1522 1592 optval, optlen_res);
1523 1593 kmem_free(optval, optlen);
1524 1594 if (error)
1525 1595 return (set_errno(error));
1526 1596 return (0);
1527 1597 }
1528 1598
1529 1599 /*ARGSUSED5*/
1530 1600 int
1531 1601 setsockopt(int sock, int level, int option_name, void *option_value,
1532 1602 socklen_t option_len, int version)
1533 1603 {
1534 1604 struct sonode *so;
1535 1605 intptr_t buffer[2];
1536 1606 void *optval = NULL;
1537 1607 int error;
1538 1608
1539 1609 dprint(1, ("setsockopt(%d, %d, %d, %p, %d)\n",
1540 1610 sock, level, option_name, option_value, option_len));
1541 1611
1542 1612 if ((so = getsonode(sock, &error, NULL)) == NULL)
1543 1613 return (set_errno(error));
1544 1614
1545 1615 if (option_value != NULL) {
1546 1616 if (option_len != 0) {
1547 1617 /*
1548 1618 * Verify that the length is not excessive to prevent
1549 1619 * an application from consuming all of kernel memory.
1550 1620 */
1551 1621 if (option_len > SO_MAXARGSIZE) {
1552 1622 error = EINVAL;
1553 1623 goto done2;
1554 1624 }
1555 1625 optval = option_len <= sizeof (buffer) ?
1556 1626 &buffer : kmem_alloc((size_t)option_len, KM_SLEEP);
1557 1627 ASSERT(MUTEX_NOT_HELD(&so->so_lock));
1558 1628 if (copyin(option_value, optval, (size_t)option_len)) {
1559 1629 error = EFAULT;
1560 1630 goto done1;
1561 1631 }
1562 1632 }
1563 1633 } else
1564 1634 option_len = 0;
1565 1635
1566 1636 error = socket_setsockopt(so, level, option_name, optval,
1567 1637 (t_uscalar_t)option_len, CRED());
1568 1638 done1:
1569 1639 if (optval != buffer)
1570 1640 kmem_free(optval, (size_t)option_len);
1571 1641 done2:
1572 1642 releasef(sock);
1573 1643 if (error)
1574 1644 return (set_errno(error));
1575 1645 return (0);
1576 1646 }
1577 1647
1578 1648 static int
1579 1649 sockconf_add_sock(int family, int type, int protocol, char *name)
1580 1650 {
1581 1651 int error = 0;
1582 1652 char *kdevpath = NULL;
1583 1653 char *kmodule = NULL;
1584 1654 char *buf = NULL;
1585 1655 size_t pathlen = 0;
1586 1656 struct sockparams *sp;
1587 1657
1588 1658 if (name == NULL)
1589 1659 return (EINVAL);
1590 1660 /*
1591 1661 * Copyin the name.
1592 1662 * This also makes it possible to check for too long pathnames.
1593 1663 * Compress the space needed for the name before passing it
1594 1664 * to soconfig - soconfig will store the string until
1595 1665 * the configuration is removed.
1596 1666 */
1597 1667 buf = kmem_alloc(MAXPATHLEN, KM_SLEEP);
1598 1668 if ((error = copyinstr(name, buf, MAXPATHLEN, &pathlen)) != 0) {
1599 1669 kmem_free(buf, MAXPATHLEN);
1600 1670 return (error);
1601 1671 }
1602 1672 if (strncmp(buf, "/dev", strlen("/dev")) == 0) {
1603 1673 /* For device */
1604 1674
1605 1675 /*
1606 1676 * Special handling for NCA:
1607 1677 *
1608 1678 * DEV_NCA is never opened even if an application
1609 1679 * requests for AF_NCA. The device opened is instead a
1610 1680 * predefined AF_INET transport (NCA_INET_DEV).
1611 1681 *
1612 1682 * Prior to Volo (PSARC/2007/587) NCA would determine
1613 1683 * the device using a lookup, which worked then because
1614 1684 * all protocols were based on TPI. Since TPI is no
1615 1685 * longer the default, we have to explicitly state
1616 1686 * which device to use.
1617 1687 */
1618 1688 if (strcmp(buf, NCA_DEV) == 0) {
1619 1689 /* only support entry <28, 2, 0> */
1620 1690 if (family != AF_NCA || type != SOCK_STREAM ||
1621 1691 protocol != 0) {
1622 1692 kmem_free(buf, MAXPATHLEN);
1623 1693 return (EINVAL);
1624 1694 }
1625 1695
1626 1696 pathlen = strlen(NCA_INET_DEV) + 1;
1627 1697 kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1628 1698 bcopy(NCA_INET_DEV, kdevpath, pathlen);
1629 1699 kdevpath[pathlen - 1] = '\0';
1630 1700 } else {
1631 1701 kdevpath = kmem_alloc(pathlen, KM_SLEEP);
1632 1702 bcopy(buf, kdevpath, pathlen);
1633 1703 kdevpath[pathlen - 1] = '\0';
1634 1704 }
1635 1705 } else {
1636 1706 /* For socket module */
1637 1707 kmodule = kmem_alloc(pathlen, KM_SLEEP);
1638 1708 bcopy(buf, kmodule, pathlen);
1639 1709 kmodule[pathlen - 1] = '\0';
1640 1710 pathlen = 0;
1641 1711 }
1642 1712 kmem_free(buf, MAXPATHLEN);
1643 1713
1644 1714 /* sockparams_create frees mod name and devpath upon failure */
1645 1715 sp = sockparams_create(family, type, protocol, kmodule,
1646 1716 kdevpath, pathlen, 0, KM_SLEEP, &error);
1647 1717 if (sp != NULL) {
1648 1718 error = sockparams_add(sp);
1649 1719 if (error != 0)
1650 1720 sockparams_destroy(sp);
1651 1721 }
1652 1722
1653 1723 return (error);
1654 1724 }
1655 1725
1656 1726 static int
1657 1727 sockconf_remove_sock(int family, int type, int protocol)
1658 1728 {
1659 1729 return (sockparams_delete(family, type, protocol));
1660 1730 }
1661 1731
1662 1732 static int
1663 1733 sockconfig_remove_filter(const char *uname)
1664 1734 {
1665 1735 char kname[SOF_MAXNAMELEN];
1666 1736 size_t len;
1667 1737 int error;
1668 1738 sof_entry_t *ent;
1669 1739
1670 1740 if ((error = copyinstr(uname, kname, SOF_MAXNAMELEN, &len)) != 0)
1671 1741 return (error);
1672 1742
1673 1743 ent = sof_entry_remove_by_name(kname);
1674 1744 if (ent == NULL)
1675 1745 return (ENXIO);
1676 1746
1677 1747 mutex_enter(&ent->sofe_lock);
1678 1748 ASSERT(!(ent->sofe_flags & SOFEF_CONDEMED));
1679 1749 if (ent->sofe_refcnt == 0) {
1680 1750 mutex_exit(&ent->sofe_lock);
1681 1751 sof_entry_free(ent);
1682 1752 } else {
1683 1753 /* let the last socket free the filter */
1684 1754 ent->sofe_flags |= SOFEF_CONDEMED;
1685 1755 mutex_exit(&ent->sofe_lock);
1686 1756 }
1687 1757
1688 1758 return (0);
1689 1759 }
1690 1760
1691 1761 static int
1692 1762 sockconfig_add_filter(const char *uname, void *ufilpropp)
1693 1763 {
1694 1764 struct sockconfig_filter_props filprop;
1695 1765 sof_entry_t *ent;
1696 1766 int error;
1697 1767 size_t tuplesz, len;
1698 1768 char hintbuf[SOF_MAXNAMELEN];
1699 1769
1700 1770 ent = kmem_zalloc(sizeof (sof_entry_t), KM_SLEEP);
1701 1771 mutex_init(&ent->sofe_lock, NULL, MUTEX_DEFAULT, NULL);
1702 1772
1703 1773 if ((error = copyinstr(uname, ent->sofe_name, SOF_MAXNAMELEN,
1704 1774 &len)) != 0) {
1705 1775 sof_entry_free(ent);
1706 1776 return (error);
1707 1777 }
1708 1778
1709 1779 if (get_udatamodel() == DATAMODEL_NATIVE) {
1710 1780 if (copyin(ufilpropp, &filprop, sizeof (filprop)) != 0) {
1711 1781 sof_entry_free(ent);
1712 1782 return (EFAULT);
1713 1783 }
1714 1784 }
1715 1785 #ifdef _SYSCALL32_IMPL
1716 1786 else {
1717 1787 struct sockconfig_filter_props32 filprop32;
1718 1788
1719 1789 if (copyin(ufilpropp, &filprop32, sizeof (filprop32)) != 0) {
1720 1790 sof_entry_free(ent);
1721 1791 return (EFAULT);
1722 1792 }
1723 1793 filprop.sfp_modname = (char *)(uintptr_t)filprop32.sfp_modname;
1724 1794 filprop.sfp_autoattach = filprop32.sfp_autoattach;
1725 1795 filprop.sfp_hint = filprop32.sfp_hint;
1726 1796 filprop.sfp_hintarg = (char *)(uintptr_t)filprop32.sfp_hintarg;
1727 1797 filprop.sfp_socktuple_cnt = filprop32.sfp_socktuple_cnt;
1728 1798 filprop.sfp_socktuple =
1729 1799 (sof_socktuple_t *)(uintptr_t)filprop32.sfp_socktuple;
1730 1800 }
1731 1801 #endif /* _SYSCALL32_IMPL */
1732 1802
1733 1803 if ((error = copyinstr(filprop.sfp_modname, ent->sofe_modname,
1734 1804 sizeof (ent->sofe_modname), &len)) != 0) {
1735 1805 sof_entry_free(ent);
1736 1806 return (error);
1737 1807 }
1738 1808
1739 1809 /*
1740 1810 * A filter must specify at least one socket tuple.
1741 1811 */
1742 1812 if (filprop.sfp_socktuple_cnt == 0 ||
1743 1813 filprop.sfp_socktuple_cnt > SOF_MAXSOCKTUPLECNT) {
1744 1814 sof_entry_free(ent);
1745 1815 return (EINVAL);
1746 1816 }
1747 1817 ent->sofe_flags = filprop.sfp_autoattach ? SOFEF_AUTO : SOFEF_PROG;
1748 1818 ent->sofe_hint = filprop.sfp_hint;
1749 1819
1750 1820 /*
1751 1821 * Verify the hint, and copy in the hint argument, if necessary.
1752 1822 */
1753 1823 switch (ent->sofe_hint) {
1754 1824 case SOF_HINT_BEFORE:
1755 1825 case SOF_HINT_AFTER:
1756 1826 if ((error = copyinstr(filprop.sfp_hintarg, hintbuf,
1757 1827 sizeof (hintbuf), &len)) != 0) {
1758 1828 sof_entry_free(ent);
1759 1829 return (error);
1760 1830 }
1761 1831 ent->sofe_hintarg = kmem_alloc(len, KM_SLEEP);
1762 1832 bcopy(hintbuf, ent->sofe_hintarg, len);
1763 1833 /* FALLTHRU */
1764 1834 case SOF_HINT_TOP:
1765 1835 case SOF_HINT_BOTTOM:
1766 1836 /* hints cannot be used with programmatic filters */
1767 1837 if (ent->sofe_flags & SOFEF_PROG) {
1768 1838 sof_entry_free(ent);
1769 1839 return (EINVAL);
1770 1840 }
1771 1841 break;
1772 1842 case SOF_HINT_NONE:
1773 1843 break;
1774 1844 default:
1775 1845 /* bad hint value */
1776 1846 sof_entry_free(ent);
1777 1847 return (EINVAL);
1778 1848 }
1779 1849
1780 1850 ent->sofe_socktuple_cnt = filprop.sfp_socktuple_cnt;
1781 1851 tuplesz = sizeof (sof_socktuple_t) * ent->sofe_socktuple_cnt;
1782 1852 ent->sofe_socktuple = kmem_alloc(tuplesz, KM_SLEEP);
1783 1853
1784 1854 if (get_udatamodel() == DATAMODEL_NATIVE) {
1785 1855 if (copyin(filprop.sfp_socktuple, ent->sofe_socktuple,
1786 1856 tuplesz)) {
1787 1857 sof_entry_free(ent);
1788 1858 return (EFAULT);
1789 1859 }
1790 1860 }
1791 1861 #ifdef _SYSCALL32_IMPL
1792 1862 else {
1793 1863 int i;
1794 1864 caddr_t data = (caddr_t)filprop.sfp_socktuple;
1795 1865 sof_socktuple_t *tup = ent->sofe_socktuple;
1796 1866 sof_socktuple32_t tup32;
1797 1867
1799 1869 		for (i = 0; i < ent->sofe_socktuple_cnt; i++, tup++) {
1800 1870 			ASSERT(tup < ent->sofe_socktuple + ent->sofe_socktuple_cnt);
1801 1871
1802 1872 if (copyin(data, &tup32, sizeof (tup32)) != 0) {
1803 1873 sof_entry_free(ent);
1804 1874 return (EFAULT);
1805 1875 }
1806 1876 tup->sofst_family = tup32.sofst_family;
1807 1877 tup->sofst_type = tup32.sofst_type;
1808 1878 tup->sofst_protocol = tup32.sofst_protocol;
1809 1879
1810 1880 data += sizeof (tup32);
1811 1881 }
1812 1882 }
1813 1883 #endif /* _SYSCALL32_IMPL */
1814 1884
1815 1885 /* Sockets can start using the filter as soon as the filter is added */
1816 1886 if ((error = sof_entry_add(ent)) != 0)
1817 1887 sof_entry_free(ent);
1818 1888
1819 1889 return (error);
1820 1890 }
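
For context, a filter registration arrives from user level as a sockconfig_filter_props structure shaped exactly the way the copyin logic above expects. A minimal sketch of what a caller might build follows; the filter module name, the tuple values, and the <sys/sockfilter.h> header location are assumptions for illustration, not taken from this change:

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <sys/sockfilter.h>	/* assumed home of sockconfig_filter_props */

	/* Match TCP over IPv4 and IPv6; field names follow the copyin above. */
	static sof_socktuple_t mytuples[] = {
		{ .sofst_family = AF_INET,  .sofst_type = SOCK_STREAM,
		    .sofst_protocol = IPPROTO_TCP },
		{ .sofst_family = AF_INET6, .sofst_type = SOCK_STREAM,
		    .sofst_protocol = IPPROTO_TCP },
	};

	static struct sockconfig_filter_props myprops = {
		.sfp_modname = "myfiltmod",	/* hypothetical filter module */
		.sfp_autoattach = B_TRUE,	/* SOFEF_AUTO rather than SOFEF_PROG */
		.sfp_hint = SOF_HINT_NONE,	/* so no hint argument is needed */
		.sfp_hintarg = NULL,
		.sfp_socktuple_cnt = 2,
		.sfp_socktuple = mytuples,
	};

The address of such a structure is what sockconfig_add_filter() receives as ufilpropp, via the SOCKCONFIG_ADD_FILTER subcommand of sockconfig() below.
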
1821 1891
1822 1892 /*
1823 1893 * Socket configuration system call. It is used to add and remove
1824 1894 * socket types.
1825 1895 */
1826 1896 int
1827 1897 sockconfig(int cmd, void *arg1, void *arg2, void *arg3, void *arg4)
1828 1898 {
1829 1899 int error = 0;
1830 1900
1831 1901 if (secpolicy_net_config(CRED(), B_FALSE) != 0)
1832 1902 return (set_errno(EPERM));
1833 1903
1834 1904 if (sockfs_defer_nl7c_init) {
1835 1905 nl7c_init();
1836 1906 sockfs_defer_nl7c_init = 0;
1837 1907 }
1838 1908
1839 1909 switch (cmd) {
1840 1910 case SOCKCONFIG_ADD_SOCK:
1841 1911 error = sockconf_add_sock((int)(uintptr_t)arg1,
1842 1912 (int)(uintptr_t)arg2, (int)(uintptr_t)arg3, arg4);
1843 1913 break;
1844 1914 case SOCKCONFIG_REMOVE_SOCK:
1845 1915 error = sockconf_remove_sock((int)(uintptr_t)arg1,
1846 1916 (int)(uintptr_t)arg2, (int)(uintptr_t)arg3);
1847 1917 break;
1848 1918 case SOCKCONFIG_ADD_FILTER:
1849 1919 error = sockconfig_add_filter((const char *)arg1, arg2);
1850 1920 break;
1851 1921 case SOCKCONFIG_REMOVE_FILTER:
1852 1922 error = sockconfig_remove_filter((const char *)arg1);
1853 1923 break;
1854 1924 case SOCKCONFIG_GET_SOCKTABLE:
1855 1925 error = sockparams_copyout_socktable((int)(uintptr_t)arg1);
1856 1926 break;
1857 1927 default:
1858 1928 #ifdef DEBUG
1859 1929 		cmn_err(CE_NOTE, "sockconfig: unknown subcommand %d", cmd);
1860 1930 #endif
1861 1931 error = EINVAL;
1862 1932 break;
1863 1933 }
1864 1934
1865 1935 if (error != 0) {
1866 1936 eprintline(error);
1867 1937 return (set_errno(error));
1868 1938 }
1869 1939 return (0);
1870 1940 }
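
These subcommands are driven from user level (historically by the soconfig(1M) utility) and, per the secpolicy_net_config() check above, require network configuration privilege. A rough sketch of a direct invocation; the SYS_sockconfig syscall number and the user-level visibility of the SOCKCONFIG_* constants are assumptions here:

	#include <sys/syscall.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <unistd.h>

	/*
	 * Install the <AF_INET, SOCK_STREAM, IPPROTO_TCP> entry backed by
	 * the "tcp" socket module, then remove it again, mirroring the
	 * SOCKCONFIG_ADD_SOCK/SOCKCONFIG_REMOVE_SOCK cases handled above.
	 */
	static int
	sockconfig_example(void)
	{
		if (syscall(SYS_sockconfig, SOCKCONFIG_ADD_SOCK,
		    AF_INET, SOCK_STREAM, IPPROTO_TCP, "tcp") != 0)
			return (-1);
		return (syscall(SYS_sockconfig, SOCKCONFIG_REMOVE_SOCK,
		    AF_INET, SOCK_STREAM, IPPROTO_TCP));
	}
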
1871 1941
1872 1942
1873 1943 /*
1874 1944  * Sendfile is implemented through two schemes: direct I/O, or
1875 1945  * caching in the filesystem page cache. We cache the input file by
1876 1946 * default and use direct I/O only if sendfile_max_size is set
1877 1947 * appropriately as explained below. Note that this logic is consistent
1878 1948 * with other filesystems where caching is turned on by default
1879 1949 * unless explicitly turned off by using the DIRECTIO ioctl.
1880 1950 *
1881 1951 * We choose a slightly different scheme here. One can turn off
1882 1952 * caching by setting sendfile_max_size to 0. One can also enable
1883 1953 * caching of files <= sendfile_max_size by setting sendfile_max_size
1884 1954  * to an appropriate value. By default, sendfile_max_size is set to the
1885 1955  * maximum value so that all files are cached. In the future, we may provide
1886 1956 * better interfaces for caching the file.
1887 1957 *
1888 1958 * Sendfile through Direct I/O (Zero copy)
1889 1959 * --------------------------------------
1890 1960 *
1891 1961 * As disks are normally slower than the network, we can't have a
1892 1962 * single thread that reads the disk and writes to the network. We
1893 1963 * need to have parallelism. This is done by having the sendfile
1894 1964 * thread create another thread that reads from the filesystem
1895 1965 * and queues it for network processing. In this scheme, the data
1896 1966  * is never copied anywhere, i.e. it is zero copy, unlike the other
1897 1967 * scheme.
1898 1968 *
1899 1969 * We have a sendfile queue (snfq) where each sendfile
1900 1970 * request (snf_req_t) is queued for processing by a thread. Number
1901 1971  * request (snf_req_t) is queued for processing by a thread. The
1902 1972  * number of threads is dynamic, and a thread exits if it idles
1903 1973  * beyond a specified amount of time. When each request (snf_req_t) is
1904 1974 * be consumed by the sendfile thread. snf_deque and snf_enque are
1905 1975 * used for consuming and producing mblks. Size of the filesystem
1906 1976 * read is determined by the tunable (sendfile_read_size). A single
1907 1977 * mblk holds sendfile_read_size worth of data (except the last
1908 1978  * read of the file), which is sent down as a whole to the network.
1909 1979 * sendfile_read_size is set to 1 MB as this seems to be the optimal
1910 1980 * value for the UFS filesystem backed by a striped storage array.
1911 1981 *
1912 1982  * Synchronization between read (producer) and write (consumer) threads.
1913 1983 * --------------------------------------------------------------------
1914 1984 *
1915 1985 * sr_lock protects sr_ib_head and sr_ib_tail. The lock is held while
1916 1986  * adding and deleting items in this list. Errors can happen at any
1917 1987  * time during read or write. There could be unprocessed mblks in the
1918 1988  * sr_ib_XXX list when a read or write error occurs. Whenever an error
1919 1989  * is encountered, we need two things to happen:
1920 1990 *
1921 1991  * a) One of the threads needs to clean the mblks.
1922 1992 * b) When one thread encounters an error, the other should stop.
1923 1993 *
1924 1994 * For (a), we don't want to penalize the reader thread as it could do
1925 1995 * some useful work processing other requests. For (b), the error can
1926 1996 * be detected by examining sr_read_error or sr_write_error.
1927 1997 * sr_lock protects sr_read_error and sr_write_error. If both reader and
1928 1998  * writer encounter errors, we need to report the write error back to
1929 1999  * the application as that's what would have happened if the operations
1930 2000  * were done sequentially. With this in mind, the following should work:
1931 2001 *
1932 2002 * - Check for errors before read or write.
1933 2003  * - If the reader encounters an error, set the error in sr_read_error.
1934 2004  *   Check sr_write_error; if it is set, send cv_signal as it is
1935 2005  *   waiting for the reader to complete. If it is not set, the writer
1936 2006  *   is either running, sinking data to the network, or blocked
1937 2007 * because of flow control. For handling the latter case, we
1938 2008 * always send a signal. In any case, it will examine sr_read_error
1939 2009 * and return. sr_read_error is marked with SR_READ_DONE to tell
1940 2010 * the writer that the reader is done in all the cases.
1941 2011  * - If the writer encounters an error, set the error in sr_write_error.
1942 2012  *   The reader thread is either blocked because of flow control or
1943 2013  *   running, reading data from the disk. For the former, we need to
1944 2014  *   wake up the thread. Again, to keep it simple, we always wake up
1945 2015 * the reader thread. Then, wait for the read thread to complete
1946 2016 * if it is not done yet. Cleanup and return.
1947 2017 *
1948 2018 * High and low water marks for the read thread.
1949 2019 * --------------------------------------------
1950 2020 *
1951 2021 * If sendfile() is used to send data over a slow network, we need to
1952 2022 * make sure that the read thread does not produce data at a faster
1953 2023 * rate than the network. This can happen if the disk is faster than
1954 2024 * the network. In such a case, we don't want to build a very large queue.
1955 2025 * But we would still like to get all of the network throughput possible.
1956 2026  * This implies that the network should never block waiting for data.
1957 2027  * As there are a lot of disk throughput/network throughput combinations
1958 2028  * possible, it is difficult to come up with an accurate number.
1959 2029  * A typical 10K RPM disk has a max seek latency of 17ms and a rotational
1960 2030  * latency of 3ms for reading a disk block. Thus, the total latency to
1961 2031  * initiate a new read, transfer data from the disk and queue it for
1962 2032  * transmission would take a max of about 25ms. Today's max transfer rate
1963 2033  * for a network is 100MB/sec. If the thread is blocked because of flow
1964 2034  * control, it would take 25ms to get new data ready for transmission.
1965 2035  * We have to make sure that the network is not idling while we are
1966 2036  * initiating new transfers. So, at 100MB/sec, to keep the network busy we
1967 2037  * would need 2.5MB of data. Rounding up, we keep the low water mark at 3MB.
1968 2038 * We need to pick a high water mark so that the woken up thread would
1969 2039  * do considerable work before blocking again, to prevent thrashing.
1970 2040  * Currently, we pick this to be 10 times the low water mark (see the sketch below).
1971 2041 *
1972 2042 * Sendfile with segmap caching (One copy from page cache to mblks).
1973 2043 * ----------------------------------------------------------------
1974 2044 *
1975 2045 * We use the segmap cache for caching the file, if the size of file
1976 2046  * is <= sendfile_max_size. In this case we don't use threads, as VM
1977 2047  * is fast enough to keep up with the network. If the underlying
1978 2048 * transport allows, we call segmap_getmapflt() to map MAXBSIZE (8K) worth
1979 2049 * of data into segmap space, and use the virtual address from segmap
1980 2050 * directly through desballoc() to avoid copy. Once the transport is done
1981 2051 * with the data, the mapping will be released through segmap_release()
1982 2052 * called by the call-back routine.
1983 2053 *
1984 2054 * If zero-copy is not allowed by the transport, we simply call VOP_READ()
1985 2055 * to copy the data from the filesystem into our temporary network buffer.
1986 2056 *
1987 2057 * To disable caching, set sendfile_max_size to 0.
1988 2058 */
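
The low water figure above is just a bandwidth-delay product; a small sketch of the same arithmetic, using the latency and bandwidth assumptions stated in the comment (the macro names here are illustrative, not part of the source):

	/* ~25ms worst case to produce fresh data (seek + rotation + transfer) */
	#define	SNF_PRODUCE_LATENCY_MS	25
	/* assumed network drain rate of 100MB/sec */
	#define	SNF_DRAIN_BYTES_PER_SEC	(100 * 1024 * 1024)

	/*
	 * Bytes needed to keep the network busy while one more read completes:
	 * (100MB/sec / 1000) * 25ms ~= 2.5MB, rounded up to the 3MB
	 * SENDFILE_REQ_LOWAT below; the high water mark is 10x that.
	 */
	#define	SNF_CALC_LOWAT	\
		((SNF_DRAIN_BYTES_PER_SEC / 1000) * SNF_PRODUCE_LATENCY_MS)
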
1989 2059
1990 2060 uint_t sendfile_read_size = 1024 * 1024;
1991 2061 #define	SENDFILE_REQ_LOWAT	(3 * 1024 * 1024)
1992 2062 uint_t sendfile_req_lowat = SENDFILE_REQ_LOWAT;
1993 2063 uint_t sendfile_req_hiwat = 10 * SENDFILE_REQ_LOWAT;
1994 2064 struct sendfile_stats sf_stats;
1995 2065 struct sendfile_queue *snfq;
1996 2066 clock_t snfq_timeout;
1997 2067 off64_t sendfile_max_size;
1998 2068
1999 2069 static void snf_enque(snf_req_t *, mblk_t *);
2000 2070 static mblk_t *snf_deque(snf_req_t *);
2001 2071
2002 2072 void
2003 2073 sendfile_init(void)
2004 2074 {
2005 2075 snfq = kmem_zalloc(sizeof (struct sendfile_queue), KM_SLEEP);
2006 2076
2007 2077 mutex_init(&snfq->snfq_lock, NULL, MUTEX_DEFAULT, NULL);
2008 2078 cv_init(&snfq->snfq_cv, NULL, CV_DEFAULT, NULL);
2009 2079 snfq->snfq_max_threads = max_ncpus;
2010 2080 snfq_timeout = SNFQ_TIMEOUT;
2011 2081 /* Cache all files by default. */
2012 2082 sendfile_max_size = MAXOFFSET_T;
2013 2083 }
2014 2084
2015 2085 /*
2016 2086  * Queues an mblk_t for network processing.
2017 2087 */
2018 2088 static void
2019 2089 snf_enque(snf_req_t *sr, mblk_t *mp)
2020 2090 {
2021 2091 mp->b_next = NULL;
2022 2092 mutex_enter(&sr->sr_lock);
2023 2093 if (sr->sr_mp_head == NULL) {
2024 2094 sr->sr_mp_head = sr->sr_mp_tail = mp;
2025 2095 cv_signal(&sr->sr_cv);
2026 2096 } else {
2027 2097 sr->sr_mp_tail->b_next = mp;
2028 2098 sr->sr_mp_tail = mp;
2029 2099 }
2030 2100 sr->sr_qlen += MBLKL(mp);
2031 2101 while ((sr->sr_qlen > sr->sr_hiwat) &&
2032 2102 (sr->sr_write_error == 0)) {
2033 2103 sf_stats.ss_full_waits++;
2034 2104 cv_wait(&sr->sr_cv, &sr->sr_lock);
2035 2105 }
2036 2106 mutex_exit(&sr->sr_lock);
2037 2107 }
2038 2108
2039 2109 /*
2040 2110  * Dequeues an mblk_t for network processing.
2041 2111 */
2042 2112 static mblk_t *
2043 2113 snf_deque(snf_req_t *sr)
2044 2114 {
2045 2115 mblk_t *mp;
2046 2116
2047 2117 mutex_enter(&sr->sr_lock);
2048 2118 /*
2049 2119 	 * If we have encountered an error on read, or the read is
2050 2120 	 * completed and there are no more mblks, return NULL.
2051 2121 	 * We also need to check for a NULL sr_mp_head, as
2052 2122 * the reads could have completed and there is
2053 2123 * nothing more to come.
2054 2124 */
2055 2125 if (((sr->sr_read_error & ~SR_READ_DONE) != 0) ||
2056 2126 ((sr->sr_read_error & SR_READ_DONE) &&
2057 2127 sr->sr_mp_head == NULL)) {
2058 2128 mutex_exit(&sr->sr_lock);
2059 2129 return (NULL);
2060 2130 }
2061 2131 /*
2062 2132 	 * To start with, neither SR_READ_DONE is marked nor
2063 2133 	 * the error is set. When we wake up from cv_wait,
2064 2134 	 * the following are the possibilities:
2065 2135 *
2066 2136 * a) sr_read_error is zero and mblks are queued.
2067 2137 * b) sr_read_error is set to SR_READ_DONE
2068 2138 * and mblks are queued.
2069 2139 * c) sr_read_error is set to SR_READ_DONE
2070 2140 * and no mblks.
2071 2141 * d) sr_read_error is set to some error other
2072 2142 * than SR_READ_DONE.
2073 2143 */
2074 2144
2075 2145 while ((sr->sr_read_error == 0) && (sr->sr_mp_head == NULL)) {
2076 2146 sf_stats.ss_empty_waits++;
2077 2147 cv_wait(&sr->sr_cv, &sr->sr_lock);
2078 2148 }
2079 2149 /* Handle (a) and (b) first - the normal case. */
2080 2150 if (((sr->sr_read_error & ~SR_READ_DONE) == 0) &&
2081 2151 (sr->sr_mp_head != NULL)) {
2082 2152 mp = sr->sr_mp_head;
2083 2153 sr->sr_mp_head = mp->b_next;
2084 2154 sr->sr_qlen -= MBLKL(mp);
2085 2155 if (sr->sr_qlen < sr->sr_lowat)
2086 2156 cv_signal(&sr->sr_cv);
2087 2157 mutex_exit(&sr->sr_lock);
2088 2158 mp->b_next = NULL;
2089 2159 return (mp);
2090 2160 }
2091 2161 /* Handle (c) and (d). */
2092 2162 mutex_exit(&sr->sr_lock);
2093 2163 return (NULL);
2094 2164 }
2095 2165
2096 2166 /*
2097 2167 * Reads data from the filesystem and queues it for network processing.
2098 2168 */
2099 2169 void
2100 2170 snf_async_read(snf_req_t *sr)
2101 2171 {
2102 2172 size_t iosize;
2103 2173 u_offset_t fileoff;
2104 2174 u_offset_t size;
2105 2175 int ret_size;
2106 2176 int error;
2107 2177 file_t *fp;
2108 2178 mblk_t *mp;
2109 2179 struct vnode *vp;
2110 2180 int extra = 0;
2111 2181 int maxblk = 0;
2112 2182 int wroff = 0;
2113 2183 struct sonode *so;
2114 2184
2115 2185 fp = sr->sr_fp;
2116 2186 size = sr->sr_file_size;
2117 2187 fileoff = sr->sr_file_off;
2118 2188
2119 2189 /*
2120 2190 	 * Ignore the error for filesystems that don't support DIRECTIO.
2121 2191 */
2122 2192 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_ON, 0,
2123 2193 kcred, NULL, NULL);
2124 2194
2125 2195 vp = sr->sr_vp;
2126 2196 if (vp->v_type == VSOCK) {
2127 2197 stdata_t *stp;
2128 2198
2129 2199 /*
2130 2200 * Get the extra space to insert a header and a trailer.
2131 2201 */
2132 2202 so = VTOSO(vp);
2133 2203 stp = vp->v_stream;
2134 2204 if (stp == NULL) {
2135 2205 wroff = so->so_proto_props.sopp_wroff;
2136 2206 maxblk = so->so_proto_props.sopp_maxblk;
2137 2207 extra = wroff + so->so_proto_props.sopp_tail;
2138 2208 } else {
2139 2209 wroff = (int)(stp->sd_wroff);
2140 2210 maxblk = (int)(stp->sd_maxblk);
2141 2211 extra = wroff + (int)(stp->sd_tail);
2142 2212 }
2143 2213 }
2144 2214
2145 2215 while ((size != 0) && (sr->sr_write_error == 0)) {
2146 2216
2147 2217 iosize = (int)MIN(sr->sr_maxpsz, size);
2148 2218
2149 2219 /*
2150 2220 * Socket filters can limit the mblk size,
2151 2221 * so limit reads to maxblk if there are
2152 2222 * filters present.
2153 2223 */
2154 2224 if (vp->v_type == VSOCK &&
2155 2225 so->so_filter_active > 0 && maxblk != INFPSZ)
2156 2226 iosize = (int)MIN(iosize, maxblk);
2157 2227
2158 2228 if (is_system_labeled()) {
2159 2229 mp = allocb_cred(iosize + extra, CRED(),
2160 2230 curproc->p_pid);
2161 2231 } else {
2162 2232 mp = allocb(iosize + extra, BPRI_MED);
2163 2233 }
2164 2234 if (mp == NULL) {
2165 2235 error = EAGAIN;
2166 2236 break;
2167 2237 }
2168 2238
2169 2239 mp->b_rptr += wroff;
2170 2240
2171 2241 ret_size = soreadfile(fp, mp->b_rptr, fileoff, &error, iosize);
2172 2242
2173 2243 		/* Error or reached EOF? */
2174 2244 if ((error != 0) || (ret_size == 0)) {
2175 2245 freeb(mp);
2176 2246 break;
2177 2247 }
2178 2248 mp->b_wptr = mp->b_rptr + ret_size;
2179 2249
2180 2250 snf_enque(sr, mp);
2181 2251 size -= ret_size;
2182 2252 fileoff += ret_size;
2183 2253 }
2184 2254 (void) VOP_IOCTL(fp->f_vnode, _FIODIRECTIO, DIRECTIO_OFF, 0,
2185 2255 kcred, NULL, NULL);
2186 2256 mutex_enter(&sr->sr_lock);
2187 2257 sr->sr_read_error = error;
2188 2258 sr->sr_read_error |= SR_READ_DONE;
2189 2259 cv_signal(&sr->sr_cv);
2190 2260 mutex_exit(&sr->sr_lock);
2191 2261 }
2192 2262
2193 2263 void
2194 2264 snf_async_thread(void)
2195 2265 {
2196 2266 snf_req_t *sr;
2197 2267 callb_cpr_t cprinfo;
2198 2268 clock_t time_left = 1;
2199 2269
2200 2270 CALLB_CPR_INIT(&cprinfo, &snfq->snfq_lock, callb_generic_cpr, "snfq");
2201 2271
2202 2272 mutex_enter(&snfq->snfq_lock);
2203 2273 for (;;) {
2204 2274 /*
2205 2275 		 * If we didn't find an entry, then block until woken up
2206 2276 * again and then look through the queues again.
2207 2277 */
2208 2278 while ((sr = snfq->snfq_req_head) == NULL) {
2209 2279 CALLB_CPR_SAFE_BEGIN(&cprinfo);
2210 2280 if (time_left <= 0) {
2211 2281 snfq->snfq_svc_threads--;
2212 2282 CALLB_CPR_EXIT(&cprinfo);
2213 2283 thread_exit();
2214 2284 /* NOTREACHED */
2215 2285 }
2216 2286 snfq->snfq_idle_cnt++;
2217 2287
2218 2288 time_left = cv_reltimedwait(&snfq->snfq_cv,
2219 2289 &snfq->snfq_lock, snfq_timeout, TR_CLOCK_TICK);
2220 2290 snfq->snfq_idle_cnt--;
2221 2291
2222 2292 CALLB_CPR_SAFE_END(&cprinfo, &snfq->snfq_lock);
2223 2293 }
2224 2294 snfq->snfq_req_head = sr->sr_next;
2225 2295 snfq->snfq_req_cnt--;
2226 2296 mutex_exit(&snfq->snfq_lock);
2227 2297 snf_async_read(sr);
2228 2298 mutex_enter(&snfq->snfq_lock);
2229 2299 }
2230 2300 }
2231 2301
2232 2302
2233 2303 snf_req_t *
2234 2304 create_thread(int operation, struct vnode *vp, file_t *fp,
2235 2305 u_offset_t fileoff, u_offset_t size)
2236 2306 {
2237 2307 snf_req_t *sr;
2238 2308 stdata_t *stp;
2239 2309
2240 2310 sr = (snf_req_t *)kmem_zalloc(sizeof (snf_req_t), KM_SLEEP);
2241 2311
2242 2312 sr->sr_vp = vp;
2243 2313 sr->sr_fp = fp;
2244 2314 stp = vp->v_stream;
2245 2315
2246 2316 /*
2247 2317 	 * Store sd_qn_maxpsz into sr_maxpsz while we have the stream head;
2248 2318 	 * the stream might be closed before the thread returns from snf_async_read.
2249 2319 */
2250 2320 if (stp != NULL && stp->sd_qn_maxpsz > 0) {
2251 2321 sr->sr_maxpsz = MIN(MAXBSIZE, stp->sd_qn_maxpsz);
2252 2322 } else {
2253 2323 sr->sr_maxpsz = MAXBSIZE;
2254 2324 }
2255 2325
2256 2326 sr->sr_operation = operation;
2257 2327 sr->sr_file_off = fileoff;
2258 2328 sr->sr_file_size = size;
2259 2329 sr->sr_hiwat = sendfile_req_hiwat;
2260 2330 sr->sr_lowat = sendfile_req_lowat;
2261 2331 mutex_init(&sr->sr_lock, NULL, MUTEX_DEFAULT, NULL);
2262 2332 cv_init(&sr->sr_cv, NULL, CV_DEFAULT, NULL);
2263 2333 /*
2264 2334 * See whether we need another thread for servicing this
2265 2335 	 * request. If there are already enough requests queued
2266 2336 	 * for the threads, create another one, as long as we do
2267 2337 	 * not exceed snfq_max_threads.
2268 2338 */
2269 2339 mutex_enter(&snfq->snfq_lock);
2270 2340 if (snfq->snfq_req_cnt >= snfq->snfq_idle_cnt &&
2271 2341 snfq->snfq_svc_threads < snfq->snfq_max_threads) {
2272 2342 (void) thread_create(NULL, 0, &snf_async_thread, 0, 0, &p0,
2273 2343 TS_RUN, minclsyspri);
2274 2344 snfq->snfq_svc_threads++;
2275 2345 }
2276 2346 if (snfq->snfq_req_head == NULL) {
2277 2347 snfq->snfq_req_head = snfq->snfq_req_tail = sr;
2278 2348 cv_signal(&snfq->snfq_cv);
2279 2349 } else {
2280 2350 snfq->snfq_req_tail->sr_next = sr;
2281 2351 snfq->snfq_req_tail = sr;
2282 2352 }
2283 2353 snfq->snfq_req_cnt++;
2284 2354 mutex_exit(&snfq->snfq_lock);
2285 2355 return (sr);
2286 2356 }
2287 2357
2288 2358 int
2289 2359 snf_direct_io(file_t *fp, file_t *rfp, u_offset_t fileoff, u_offset_t size,
2290 2360 ssize_t *count)
2291 2361 {
2292 2362 snf_req_t *sr;
2293 2363 mblk_t *mp;
2294 2364 int iosize;
2295 2365 int error = 0;
2296 2366 short fflag;
2297 2367 struct vnode *vp;
2298 2368 int ksize;
2299 2369 struct nmsghdr msg;
2300 2370
2301 2371 ksize = 0;
2302 2372 *count = 0;
2303 2373 bzero(&msg, sizeof (msg));
2304 2374
2305 2375 vp = fp->f_vnode;
2306 2376 fflag = fp->f_flag;
2307 2377 if ((sr = create_thread(READ_OP, vp, rfp, fileoff, size)) == NULL)
2308 2378 return (EAGAIN);
2309 2379
2310 2380 /*
2311 2381 	 * We check for read errors in snf_deque(). It has to check
2312 2382 	 * for a successful READ_DONE and return NULL anyway, so we
2313 2383 	 * might as well make the additional check there.
2314 2384 */
2315 2385 while ((mp = snf_deque(sr)) != NULL) {
2316 2386
2317 2387 if (ISSIG(curthread, JUSTLOOKING)) {
2318 2388 freeb(mp);
2319 2389 error = EINTR;
2320 2390 break;
2321 2391 }
2322 2392 iosize = MBLKL(mp);
2323 2393
2324 2394 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2325 2395
2326 2396 if (error != 0) {
2327 2397 if (mp != NULL)
2328 2398 freeb(mp);
2329 2399 break;
2330 2400 }
2331 2401 ksize += iosize;
2332 2402 }
2333 2403 *count = ksize;
2334 2404
2335 2405 mutex_enter(&sr->sr_lock);
2336 2406 sr->sr_write_error = error;
2337 2407 	/* See the big comment above on why we cv_signal here. */
2338 2408 cv_signal(&sr->sr_cv);
2339 2409
2340 2410 	/* Always wait for the reader to complete. */
2341 2411 while (!(sr->sr_read_error & SR_READ_DONE)) {
2342 2412 cv_wait(&sr->sr_cv, &sr->sr_lock);
2343 2413 }
2344 2414 /* If there is no write error, check for read error. */
2345 2415 if (error == 0)
2346 2416 error = (sr->sr_read_error & ~SR_READ_DONE);
2347 2417
2348 2418 if (error != 0) {
2349 2419 mblk_t *next_mp;
2350 2420
2351 2421 mp = sr->sr_mp_head;
2352 2422 while (mp != NULL) {
2353 2423 next_mp = mp->b_next;
2354 2424 mp->b_next = NULL;
2355 2425 freeb(mp);
2356 2426 mp = next_mp;
2357 2427 }
2358 2428 }
2359 2429 mutex_exit(&sr->sr_lock);
2360 2430 kmem_free(sr, sizeof (snf_req_t));
2361 2431 return (error);
2362 2432 }
2363 2433
2364 2434 /* Maximum no. of pages allocated by vpm for sendfile at a time */
2365 2435 #define SNF_VPMMAXPGS (VPMMAXPGS/2)
2366 2436
2367 2437 /*
2368 2438  * Maximum no. of elements in the list returned by vpm, including
2369 2439 * NULL for the last entry
2370 2440 */
2371 2441 #define SNF_MAXVMAPS (SNF_VPMMAXPGS + 1)
2372 2442
2373 2443 typedef struct {
2374 2444 unsigned int snfv_ref;
2375 2445 frtn_t snfv_frtn;
2376 2446 vnode_t *snfv_vp;
2377 2447 struct vmap snfv_vml[SNF_MAXVMAPS];
2378 2448 } snf_vmap_desbinfo;
2379 2449
2380 2450 typedef struct {
2381 2451 frtn_t snfi_frtn;
2382 2452 caddr_t snfi_base;
2383 2453 uint_t snfi_mapoff;
2384 2454 size_t snfi_len;
2385 2455 vnode_t *snfi_vp;
2386 2456 } snf_smap_desbinfo;
2387 2457
2388 2458 /*
2389 2459  * The callback function used for vpm-mapped mblks; called when the last ref
2390 2460  * of the mblk is dropped, which normally occurs when TCP receives the ack.
2391 2461  * But it can also be the driver, due to lazy reclaim.
2392 2462 */
2393 2463 void
2394 2464 snf_vmap_desbfree(snf_vmap_desbinfo *snfv)
2395 2465 {
2396 2466 ASSERT(snfv->snfv_ref != 0);
2397 2467 if (atomic_dec_32_nv(&snfv->snfv_ref) == 0) {
2398 2468 vpm_unmap_pages(snfv->snfv_vml, S_READ);
2399 2469 VN_RELE(snfv->snfv_vp);
2400 2470 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2401 2471 }
2402 2472 }
2403 2473
2404 2474 /*
2405 2475  * The callback function used for segmap'ped mblks; called when the last ref
2406 2476  * of the mblk is dropped, which normally occurs when TCP receives the ack.
2407 2477  * But it can also be the driver, due to lazy reclaim.
2408 2478 */
2409 2479 void
2410 2480 snf_smap_desbfree(snf_smap_desbinfo *snfi)
2411 2481 {
2412 2482 if (! IS_KPM_ADDR(snfi->snfi_base)) {
2413 2483 /*
2414 2484 * We don't need to call segmap_fault(F_SOFTUNLOCK) for
2415 2485 * segmap_kpm as long as the latter never falls back to
2416 2486 * "use_segmap_range". (See segmap_getmapflt().)
2417 2487 *
2418 2488 		 * Using S_OTHER saves a redundant hat_setref() in
2419 2489 		 * segmap_unlock().
2420 2490 */
2421 2491 (void) segmap_fault(kas.a_hat, segkmap,
2422 2492 (caddr_t)(uintptr_t)(((uintptr_t)snfi->snfi_base +
2423 2493 snfi->snfi_mapoff) & PAGEMASK), snfi->snfi_len,
2424 2494 F_SOFTUNLOCK, S_OTHER);
2425 2495 }
2426 2496 (void) segmap_release(segkmap, snfi->snfi_base, SM_DONTNEED);
2427 2497 VN_RELE(snfi->snfi_vp);
2428 2498 kmem_free(snfi, sizeof (*snfi));
2429 2499 }
2430 2500
2431 2501 /*
2432 2502  * Use segmap or vpm instead of bcopy to send down a desballoca'ed mblk.
2433 2503 * When segmap is used, the mblk contains a segmap slot of no more
2434 2504 * than MAXBSIZE.
2435 2505 *
2436 2506 * With vpm, a maximum of SNF_MAXVMAPS page-sized mappings can be obtained
2437 2507 * in each iteration and sent by socket_sendmblk until an error occurs or
2438 2508 * the requested size has been transferred. An mblk is esballoca'ed from
2439 2509  * each mapped page, and a chain of these mblks is sent to the transport layer.
2440 2510 * vpm will be called to unmap the pages when all mblks have been freed by
2441 2511 * free_func.
2442 2512 *
2443 2513 * At the end of the whole sendfile() operation, we wait till the data from
2444 2514 * the last mblk is ack'ed by the transport before returning so that the
2445 2515 * caller of sendfile() can safely modify the file content.
2446 2516 *
2447 2517 * The caller of this function should make sure that total_size does not exceed
2448 2518 * the actual file size of fvp.
2449 2519 */
2450 2520 int
2451 2521 snf_segmap(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t total_size,
2452 2522 ssize_t *count, boolean_t nowait)
2453 2523 {
2454 2524 caddr_t base;
2455 2525 int mapoff;
2456 2526 vnode_t *vp;
2457 2527 mblk_t *mp = NULL;
2458 2528 int chain_size;
2459 2529 int error;
2460 2530 clock_t deadlk_wait;
2461 2531 short fflag;
2462 2532 int ksize;
2463 2533 struct vattr va;
2464 2534 boolean_t dowait = B_FALSE;
2465 2535 struct nmsghdr msg;
2466 2536
2467 2537 vp = fp->f_vnode;
2468 2538 fflag = fp->f_flag;
2469 2539 ksize = 0;
2470 2540 bzero(&msg, sizeof (msg));
2471 2541
2472 2542 for (;;) {
2473 2543 if (ISSIG(curthread, JUSTLOOKING)) {
2474 2544 error = EINTR;
2475 2545 break;
2476 2546 }
2477 2547
2478 2548 if (vpm_enable) {
2479 2549 snf_vmap_desbinfo *snfv;
2480 2550 mblk_t *nmp;
2481 2551 int mblk_size;
2482 2552 int maxsize;
2483 2553 int i;
2484 2554
2485 2555 mapoff = fileoff & PAGEOFFSET;
2486 2556 maxsize = MIN((SNF_VPMMAXPGS * PAGESIZE), total_size);
2487 2557
2488 2558 snfv = kmem_zalloc(sizeof (snf_vmap_desbinfo),
2489 2559 KM_SLEEP);
2490 2560
2491 2561 /*
2492 2562 * Get vpm mappings for maxsize with read access.
2493 2563 * If the pages aren't available yet, we get
2494 2564 			 * EDEADLK, so wait and try again a little later using
2495 2565 * an increasing wait. We might be here a long time.
2496 2566 *
2497 2567 * If delay_sig returns EINTR, be sure to exit and
2498 2568 * pass it up to the caller.
2499 2569 */
2500 2570 deadlk_wait = 0;
2501 2571 while ((error = vpm_map_pages(fvp, fileoff,
2502 2572 (size_t)maxsize, (VPM_FETCHPAGE), snfv->snfv_vml,
2503 2573 SNF_MAXVMAPS, NULL, S_READ)) == EDEADLK) {
2504 2574 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2505 2575 if ((error = delay_sig(deadlk_wait)) != 0) {
2506 2576 break;
2507 2577 }
2508 2578 }
2509 2579 if (error != 0) {
2510 2580 kmem_free(snfv, sizeof (snf_vmap_desbinfo));
2511 2581 error = (error == EINTR) ? EINTR : EIO;
2512 2582 goto out;
2513 2583 }
2514 2584 snfv->snfv_frtn.free_func = snf_vmap_desbfree;
2515 2585 snfv->snfv_frtn.free_arg = (caddr_t)snfv;
2516 2586
2517 2587 /* Construct the mblk chain from the page mappings */
2518 2588 chain_size = 0;
2519 2589 for (i = 0; (snfv->snfv_vml[i].vs_addr != NULL) &&
2520 2590 total_size > 0; i++) {
2521 2591 ASSERT(chain_size < maxsize);
2522 2592 mblk_size = MIN(snfv->snfv_vml[i].vs_len -
2523 2593 mapoff, total_size);
2524 2594 nmp = esballoca(
2525 2595 (uchar_t *)snfv->snfv_vml[i].vs_addr +
2526 2596 mapoff, mblk_size, BPRI_HI,
2527 2597 &snfv->snfv_frtn);
2528 2598
2529 2599 /*
2530 2600 * We return EAGAIN after unmapping the pages
2531 2601 				 * if we cannot allocate the head of the
2532 2602 * chain. Otherwise, we continue sending the
2533 2603 * mblks constructed so far.
2534 2604 */
2535 2605 if (nmp == NULL) {
2536 2606 if (i == 0) {
2537 2607 vpm_unmap_pages(snfv->snfv_vml,
2538 2608 S_READ);
2539 2609 kmem_free(snfv,
2540 2610 sizeof (snf_vmap_desbinfo));
2541 2611 error = EAGAIN;
2542 2612 goto out;
2543 2613 }
2544 2614 break;
2545 2615 }
2546 2616 /* Mark this dblk with the zero-copy flag */
2547 2617 nmp->b_datap->db_struioflag |= STRUIO_ZC;
2548 2618 nmp->b_wptr += mblk_size;
2549 2619 chain_size += mblk_size;
2550 2620 fileoff += mblk_size;
2551 2621 total_size -= mblk_size;
2552 2622 snfv->snfv_ref++;
2553 2623 mapoff = 0;
2554 2624 if (i > 0)
2555 2625 linkb(mp, nmp);
2556 2626 else
2557 2627 mp = nmp;
2558 2628 }
2559 2629 VN_HOLD(fvp);
2560 2630 snfv->snfv_vp = fvp;
2561 2631 } else {
2562 2632 			/* vpm not supported; fall back to segmap */
2563 2633 snf_smap_desbinfo *snfi;
2564 2634
2565 2635 mapoff = fileoff & MAXBOFFSET;
2566 2636 chain_size = MAXBSIZE - mapoff;
2567 2637 if (chain_size > total_size)
2568 2638 chain_size = total_size;
2569 2639 /*
2570 2640 * we don't forcefault because we'll call
2571 2641 * segmap_fault(F_SOFTLOCK) next.
2572 2642 *
2573 2643 * S_READ will get the ref bit set (by either
2574 2644 * segmap_getmapflt() or segmap_fault()) and page
2575 2645 * shared locked.
2576 2646 */
2577 2647 base = segmap_getmapflt(segkmap, fvp, fileoff,
2578 2648 chain_size, segmap_kpm ? SM_FAULT : 0, S_READ);
2579 2649
2580 2650 snfi = kmem_alloc(sizeof (*snfi), KM_SLEEP);
2581 2651 			snfi->snfi_len = (size_t)roundup(mapoff + chain_size,
2582 2652 			    PAGESIZE) - (mapoff & PAGEMASK);
2583 2653 /*
2584 2654 * We must call segmap_fault() even for segmap_kpm
2585 2655 			 * because that's how errors get returned.
2586 2656 * (segmap_getmapflt() never fails but segmap_fault()
2587 2657 * does.)
2588 2658 *
2589 2659 * If the pages aren't available yet, we get
2590 2660 			 * EDEADLK, so wait and try again a little later using
2591 2661 * an increasing wait. We might be here a long time.
2592 2662 *
2593 2663 * If delay_sig returns EINTR, be sure to exit and
2594 2664 * pass it up to the caller.
2595 2665 */
2596 2666 deadlk_wait = 0;
2597 2667 while ((error = FC_ERRNO(segmap_fault(kas.a_hat,
2598 2668 segkmap, (caddr_t)(uintptr_t)(((uintptr_t)base +
2599 2669 mapoff) & PAGEMASK), snfi->snfi_len, F_SOFTLOCK,
2600 2670 S_READ))) == EDEADLK) {
2601 2671 deadlk_wait += (deadlk_wait < 5) ? 1 : 4;
2602 2672 if ((error = delay_sig(deadlk_wait)) != 0) {
2603 2673 break;
2604 2674 }
2605 2675 }
2606 2676 if (error != 0) {
2607 2677 (void) segmap_release(segkmap, base, 0);
2608 2678 kmem_free(snfi, sizeof (*snfi));
2609 2679 error = (error == EINTR) ? EINTR : EIO;
2610 2680 goto out;
2611 2681 }
2612 2682 snfi->snfi_frtn.free_func = snf_smap_desbfree;
2613 2683 snfi->snfi_frtn.free_arg = (caddr_t)snfi;
2614 2684 snfi->snfi_base = base;
2615 2685 snfi->snfi_mapoff = mapoff;
2616 2686 mp = esballoca((uchar_t *)base + mapoff, chain_size,
2617 2687 BPRI_HI, &snfi->snfi_frtn);
2618 2688
2619 2689 if (mp == NULL) {
2620 2690 (void) segmap_fault(kas.a_hat, segkmap,
2621 2691 (caddr_t)(uintptr_t)(((uintptr_t)base +
2622 2692 mapoff) & PAGEMASK), snfi->snfi_len,
2623 2693 F_SOFTUNLOCK, S_OTHER);
2624 2694 (void) segmap_release(segkmap, base, 0);
2625 2695 kmem_free(snfi, sizeof (*snfi));
2626 2696 freemsg(mp);
2627 2697 error = EAGAIN;
2628 2698 goto out;
2629 2699 }
2630 2700 VN_HOLD(fvp);
2631 2701 snfi->snfi_vp = fvp;
2632 2702 mp->b_wptr += chain_size;
2633 2703
2634 2704 /* Mark this dblk with the zero-copy flag */
2635 2705 mp->b_datap->db_struioflag |= STRUIO_ZC;
2636 2706 fileoff += chain_size;
2637 2707 total_size -= chain_size;
2638 2708 }
2639 2709
2640 2710 if (total_size == 0 && !nowait) {
2641 2711 ASSERT(!dowait);
2642 2712 dowait = B_TRUE;
2643 2713 mp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY;
2644 2714 }
2645 2715 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2646 2716 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2647 2717 if (error != 0) {
2648 2718 /*
2649 2719 * mp contains the mblks that were not sent by
2650 2720 * socket_sendmblk. Use its size to update *count
2651 2721 */
2652 2722 *count = ksize + (chain_size - msgdsize(mp));
2653 2723 if (mp != NULL)
2654 2724 freemsg(mp);
2655 2725 return (error);
2656 2726 }
2657 2727 ksize += chain_size;
2658 2728 if (total_size == 0)
2659 2729 goto done;
2660 2730
2661 2731 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2662 2732 va.va_mask = AT_SIZE;
2663 2733 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2664 2734 if (error)
2665 2735 break;
2666 2736 /* Read as much as possible. */
2667 2737 if (fileoff >= va.va_size)
2668 2738 break;
2669 2739 if (total_size + fileoff > va.va_size)
2670 2740 total_size = va.va_size - fileoff;
2671 2741 }
2672 2742 out:
2673 2743 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2674 2744 done:
2675 2745 *count = ksize;
2676 2746 if (dowait) {
2677 2747 stdata_t *stp;
2678 2748
2679 2749 stp = vp->v_stream;
2680 2750 if (stp == NULL) {
2681 2751 struct sonode *so;
2682 2752 so = VTOSO(vp);
2683 2753 error = so_zcopy_wait(so);
2684 2754 } else {
2685 2755 mutex_enter(&stp->sd_lock);
2686 2756 while (!(stp->sd_flag & STZCNOTIFY)) {
2687 2757 if (cv_wait_sig(&stp->sd_zcopy_wait,
2688 2758 &stp->sd_lock) == 0) {
2689 2759 error = EINTR;
2690 2760 break;
2691 2761 }
2692 2762 }
2693 2763 stp->sd_flag &= ~STZCNOTIFY;
2694 2764 mutex_exit(&stp->sd_lock);
2695 2765 }
2696 2766 }
2697 2767 return (error);
2698 2768 }
2699 2769
2700 2770 int
2701 2771 snf_cache(file_t *fp, vnode_t *fvp, u_offset_t fileoff, u_offset_t size,
2702 2772 uint_t maxpsz, ssize_t *count)
2703 2773 {
2704 2774 struct vnode *vp;
2705 2775 mblk_t *mp;
2706 2776 int iosize;
2707 2777 int extra = 0;
2708 2778 int error;
2709 2779 short fflag;
2710 2780 int ksize;
2711 2781 int ioflag;
2712 2782 struct uio auio;
2713 2783 struct iovec aiov;
2714 2784 struct vattr va;
2715 2785 int maxblk = 0;
2716 2786 int wroff = 0;
2717 2787 struct sonode *so;
2718 2788 struct nmsghdr msg;
2719 2789
2720 2790 vp = fp->f_vnode;
2721 2791 if (vp->v_type == VSOCK) {
2722 2792 stdata_t *stp;
2723 2793
2724 2794 /*
2725 2795 * Get the extra space to insert a header and a trailer.
2726 2796 */
2727 2797 so = VTOSO(vp);
2728 2798 stp = vp->v_stream;
2729 2799 if (stp == NULL) {
2730 2800 wroff = so->so_proto_props.sopp_wroff;
2731 2801 maxblk = so->so_proto_props.sopp_maxblk;
2732 2802 extra = wroff + so->so_proto_props.sopp_tail;
2733 2803 } else {
2734 2804 wroff = (int)(stp->sd_wroff);
2735 2805 maxblk = (int)(stp->sd_maxblk);
2736 2806 extra = wroff + (int)(stp->sd_tail);
2737 2807 }
2738 2808 }
2739 2809 bzero(&msg, sizeof (msg));
2740 2810 fflag = fp->f_flag;
2741 2811 ksize = 0;
2742 2812 auio.uio_iov = &aiov;
2743 2813 auio.uio_iovcnt = 1;
2744 2814 auio.uio_segflg = UIO_SYSSPACE;
2745 2815 auio.uio_llimit = MAXOFFSET_T;
2746 2816 auio.uio_fmode = fflag;
2747 2817 auio.uio_extflg = UIO_COPY_CACHED;
2748 2818 ioflag = auio.uio_fmode & (FSYNC|FDSYNC|FRSYNC);
2749 2819 /* If read sync is not asked for, filter sync flags */
2750 2820 if ((ioflag & FRSYNC) == 0)
2751 2821 ioflag &= ~(FSYNC|FDSYNC);
2752 2822 for (;;) {
2753 2823 if (ISSIG(curthread, JUSTLOOKING)) {
2754 2824 error = EINTR;
2755 2825 break;
2756 2826 }
2757 2827 iosize = (int)MIN(maxpsz, size);
2758 2828
2759 2829 /*
2760 2830 * Socket filters can limit the mblk size,
2761 2831 * so limit reads to maxblk if there are
2762 2832 * filters present.
2763 2833 */
2764 2834 if (vp->v_type == VSOCK &&
2765 2835 so->so_filter_active > 0 && maxblk != INFPSZ)
2766 2836 iosize = (int)MIN(iosize, maxblk);
2767 2837
2768 2838 if (is_system_labeled()) {
2769 2839 mp = allocb_cred(iosize + extra, CRED(),
2770 2840 curproc->p_pid);
2771 2841 } else {
2772 2842 mp = allocb(iosize + extra, BPRI_MED);
2773 2843 }
2774 2844 if (mp == NULL) {
2775 2845 error = EAGAIN;
2776 2846 break;
2777 2847 }
2778 2848
2779 2849 mp->b_rptr += wroff;
2780 2850
2781 2851 aiov.iov_base = (caddr_t)mp->b_rptr;
2782 2852 aiov.iov_len = iosize;
2783 2853 auio.uio_loffset = fileoff;
2784 2854 auio.uio_resid = iosize;
2785 2855
2786 2856 error = VOP_READ(fvp, &auio, ioflag, fp->f_cred, NULL);
2787 2857 iosize -= auio.uio_resid;
2788 2858
2789 2859 if (error == EINTR && iosize != 0)
2790 2860 error = 0;
2791 2861
2792 2862 if (error != 0 || iosize == 0) {
2793 2863 freeb(mp);
2794 2864 break;
2795 2865 }
2796 2866 mp->b_wptr = mp->b_rptr + iosize;
2797 2867
2798 2868 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2799 2869
2800 2870 error = socket_sendmblk(VTOSO(vp), &msg, fflag, CRED(), &mp);
2801 2871
2802 2872 if (error != 0) {
2803 2873 *count = ksize;
2804 2874 if (mp != NULL)
2805 2875 freeb(mp);
2806 2876 return (error);
2807 2877 }
2808 2878 ksize += iosize;
2809 2879 size -= iosize;
2810 2880 if (size == 0)
2811 2881 goto done;
2812 2882
2813 2883 fileoff += iosize;
2814 2884 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2815 2885 va.va_mask = AT_SIZE;
2816 2886 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2817 2887 if (error)
2818 2888 break;
2819 2889 /* Read as much as possible. */
2820 2890 if (fileoff >= va.va_size)
2821 2891 size = 0;
2822 2892 else if (size + fileoff > va.va_size)
2823 2893 size = va.va_size - fileoff;
2824 2894 }
2825 2895 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2826 2896 done:
2827 2897 *count = ksize;
2828 2898 return (error);
2829 2899 }
2830 2900
2831 2901 #if defined(_SYSCALL32_IMPL) || defined(_ILP32)
2832 2902 /*
2833 2903 * Largefile support for 32 bit applications only.
2834 2904 */
2835 2905 int
2836 2906 sosendfile64(file_t *fp, file_t *rfp, const struct ksendfilevec64 *sfv,
2837 2907 ssize32_t *count32)
2838 2908 {
2839 2909 ssize32_t sfv_len;
2840 2910 u_offset_t sfv_off, va_size;
2841 2911 struct vnode *vp, *fvp, *realvp;
2842 2912 struct vattr va;
2843 2913 stdata_t *stp;
2844 2914 ssize_t count = 0;
2845 2915 int error = 0;
2846 2916 boolean_t dozcopy = B_FALSE;
2847 2917 uint_t maxpsz;
2848 2918
2849 2919 sfv_len = (ssize32_t)sfv->sfv_len;
2850 2920 if (sfv_len < 0) {
2851 2921 error = EINVAL;
2852 2922 goto out;
2853 2923 }
2854 2924
2855 2925 if (sfv_len == 0) goto out;
2856 2926
2857 2927 sfv_off = (u_offset_t)sfv->sfv_off;
2858 2928
2859 2929 /* Same checks as in pread */
2860 2930 if (sfv_off > MAXOFFSET_T) {
2861 2931 error = EINVAL;
2862 2932 goto out;
2863 2933 }
2864 2934 if (sfv_off + sfv_len > MAXOFFSET_T)
2865 2935 sfv_len = (ssize32_t)(MAXOFFSET_T - sfv_off);
2866 2936
2867 2937 /*
2868 2938 * There are no more checks on sfv_len. So, we cast it to
2869 2939 * u_offset_t and share the snf_direct_io/snf_cache code between
2870 2940 * 32 bit and 64 bit.
2871 2941 *
2872 2942 * TODO: should do nbl_need_check() like read()?
2873 2943 */
2874 2944 if (sfv_len > sendfile_max_size) {
2875 2945 sf_stats.ss_file_not_cached++;
2876 2946 error = snf_direct_io(fp, rfp, sfv_off, (u_offset_t)sfv_len,
2877 2947 &count);
2878 2948 goto out;
2879 2949 }
2880 2950 fvp = rfp->f_vnode;
2881 2951 if (VOP_REALVP(fvp, &realvp, NULL) == 0)
2882 2952 fvp = realvp;
2883 2953 /*
2884 2954 * Grab the lock as a reader to prevent the file size
2885 2955 * from changing underneath.
2886 2956 */
2887 2957 (void) VOP_RWLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2888 2958 va.va_mask = AT_SIZE;
2889 2959 error = VOP_GETATTR(fvp, &va, 0, kcred, NULL);
2890 2960 va_size = va.va_size;
2891 2961 if ((error != 0) || (va_size == 0) || (sfv_off >= va_size)) {
2892 2962 VOP_RWUNLOCK(fvp, V_WRITELOCK_FALSE, NULL);
2893 2963 goto out;
2894 2964 }
2895 2965 /* Read as much as possible. */
2896 2966 if (sfv_off + sfv_len > va_size)
2897 2967 sfv_len = va_size - sfv_off;
2898 2968
2899 2969 vp = fp->f_vnode;
2900 2970 stp = vp->v_stream;
2901 2971 /*
2902 2972 * When the NOWAIT flag is not set, we enable zero-copy only if the
2903 2973 * transfer size is large enough. This prevents performance loss
2904 2974 * when the caller sends the file piece by piece.
2905 2975 */
2906 2976 if (sfv_len >= MAXBSIZE && (sfv_len >= (va_size >> 1) ||
2907 2977 (sfv->sfv_flag & SFV_NOWAIT) || sfv_len >= 0x1000000) &&
2908 2978 !vn_has_flocks(fvp) && !(fvp->v_flag & VNOMAP)) {
2909 2979 uint_t copyflag;
2910 2980 copyflag = stp != NULL ? stp->sd_copyflag :
2911 2981 VTOSO(vp)->so_proto_props.sopp_zcopyflag;
2912 2982 if ((copyflag & (STZCVMSAFE|STZCVMUNSAFE)) == 0) {
2913 2983 int on = 1;
2914 2984
2915 2985 if (socket_setsockopt(VTOSO(vp), SOL_SOCKET,
2916 2986 SO_SND_COPYAVOID, &on, sizeof (on), CRED()) == 0)
2917 2987 dozcopy = B_TRUE;
2918 2988 } else {
2919 2989 dozcopy = copyflag & STZCVMSAFE;
2920 2990 }
2921 2991 }
2922 2992 if (dozcopy) {
2923 2993 sf_stats.ss_file_segmap++;
2924 2994 error = snf_segmap(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2925 2995 &count, ((sfv->sfv_flag & SFV_NOWAIT) != 0));
2926 2996 } else {
2927 2997 if (vp->v_type == VSOCK && stp == NULL) {
2928 2998 sonode_t *so = VTOSO(vp);
2929 2999 maxpsz = so->so_proto_props.sopp_maxpsz;
2930 3000 } else if (stp != NULL) {
2931 3001 maxpsz = stp->sd_qn_maxpsz;
2932 3002 } else {
2933 3003 maxpsz = maxphys;
2934 3004 }
2935 3005
2936 3006 if (maxpsz == INFPSZ)
2937 3007 maxpsz = maxphys;
2938 3008 else
2939 3009 maxpsz = roundup(maxpsz, MAXBSIZE);
2940 3010 sf_stats.ss_file_cached++;
2941 3011 error = snf_cache(fp, fvp, sfv_off, (u_offset_t)sfv_len,
2942 3012 maxpsz, &count);
2943 3013 }
2944 3014 out:
2945 3015 releasef(sfv->sfv_fd);
2946 3016 *count32 = (ssize32_t)count;
2947 3017 return (error);
2948 3018 }
2949 3019 #endif
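
For completeness, this path is what a 32-bit process reaches through the user-level sendfile(3EXT) family. A minimal user-space sketch, assuming a connected TCP socket and a regular file (error handling kept terse; a production caller would loop, since sendfile() can return a short transfer count):

	#include <sys/sendfile.h>
	#include <sys/stat.h>
	#include <fcntl.h>
	#include <unistd.h>

	/*
	 * Ship an entire regular file down a connected socket. Whether the
	 * kernel takes the direct I/O, zero-copy segmap/vpm, or cached
	 * VOP_READ() path is decided by the size heuristics in sosendfile64().
	 */
	static int
	send_whole_file(int sock_fd, const char *path)
	{
		struct stat st;
		off_t off = 0;
		int fd;

		if ((fd = open(path, O_RDONLY)) == -1)
			return (-1);
		if (fstat(fd, &st) == -1 ||
		    sendfile(sock_fd, fd, &off, (size_t)st.st_size) == -1) {
			(void) close(fd);
			return (-1);
		}
		return (close(fd));
	}
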
2950 3020
2951 3021 #ifdef _SYSCALL32_IMPL
2952 3022 /*
2953 3023 * recv32(), recvfrom32(), send32(), sendto32(): intentionally return a
2954 3024 * ssize_t rather than ssize32_t; see the comments above read32 for details.
2955 3025 */
2956 3026
2957 3027 ssize_t
2958 3028 recv32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2959 3029 {
2960 3030 return (recv(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2961 3031 }
2962 3032
2963 3033 ssize_t
2964 3034 recvfrom32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2965 3035 caddr32_t name, caddr32_t namelenp)
2966 3036 {
2967 3037 return (recvfrom(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2968 3038 (void *)(uintptr_t)name, (void *)(uintptr_t)namelenp));
2969 3039 }
2970 3040
2971 3041 ssize_t
2972 3042 send32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags)
2973 3043 {
2974 3044 return (send(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags));
2975 3045 }
2976 3046
2977 3047 ssize_t
2978 3048 sendto32(int32_t sock, caddr32_t buffer, size32_t len, int32_t flags,
2979 3049 caddr32_t name, socklen_t namelen)
2980 3050 {
2981 3051 return (sendto(sock, (void *)(uintptr_t)buffer, (ssize32_t)len, flags,
2982 3052 (void *)(uintptr_t)name, namelen));
2983 3053 }
2984 3054 #endif /* _SYSCALL32_IMPL */
2985 3055
2986 3056 /*
2987 3057 * Function wrappers (mostly around the sonode switch) for
2988 3058 * backward compatibility.
2989 3059 */
2990 3060
2991 3061 int
2992 3062 soaccept(struct sonode *so, int fflag, struct sonode **nsop)
2993 3063 {
2994 3064 return (socket_accept(so, fflag, CRED(), nsop));
2995 3065 }
2996 3066
2997 3067 int
2998 3068 sobind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
2999 3069 int backlog, int flags)
3000 3070 {
3001 3071 int error;
3002 3072
3003 3073 error = socket_bind(so, name, namelen, flags, CRED());
3004 3074 if (error == 0 && backlog != 0)
3005 3075 return (socket_listen(so, backlog, CRED()));
3006 3076
3007 3077 return (error);
3008 3078 }
3009 3079
3010 3080 int
3011 3081 solisten(struct sonode *so, int backlog)
3012 3082 {
3013 3083 return (socket_listen(so, backlog, CRED()));
3014 3084 }
3015 3085
3016 3086 int
3017 3087 soconnect(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3018 3088 int fflag, int flags)
3019 3089 {
3020 3090 return (socket_connect(so, name, namelen, fflag, flags, CRED()));
3021 3091 }
3022 3092
3023 3093 int
3024 3094 sorecvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3025 3095 {
3026 3096 return (socket_recvmsg(so, msg, uiop, CRED()));
3027 3097 }
3028 3098
3029 3099 int
3030 3100 sosendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop)
3031 3101 {
3032 3102 return (socket_sendmsg(so, msg, uiop, CRED()));
3033 3103 }
3034 3104
3035 3105 int
3036 3106 soshutdown(struct sonode *so, int how)
3037 3107 {
3038 3108 return (socket_shutdown(so, how, CRED()));
3039 3109 }
3040 3110
3041 3111 int
3042 3112 sogetsockopt(struct sonode *so, int level, int option_name, void *optval,
3043 3113 socklen_t *optlenp, int flags)
3044 3114 {
3045 3115 return (socket_getsockopt(so, level, option_name, optval, optlenp,
3046 3116 flags, CRED()));
3047 3117 }
3048 3118
3049 3119 int
3050 3120 sosetsockopt(struct sonode *so, int level, int option_name, const void *optval,
3051 3121 t_uscalar_t optlen)
3052 3122 {
3053 3123 return (socket_setsockopt(so, level, option_name, optval, optlen,
3054 3124 CRED()));
3055 3125 }
3056 3126
3057 3127 /*
3058 3128  * Because this is a backward compatibility interface, it only needs to be
3059 3129 * able to handle the creation of TPI sockfs sockets.
3060 3130 */
3061 3131 struct sonode *
3062 3132 socreate(struct sockparams *sp, int family, int type, int protocol, int version,
3063 3133 int *errorp)
3064 3134 {
3065 3135 struct sonode *so;
3066 3136
3067 3137 ASSERT(sp != NULL);
3068 3138
3069 3139 so = sp->sp_smod_info->smod_sock_create_func(sp, family, type, protocol,
3070 3140 version, SOCKET_SLEEP, errorp, CRED());
3071 3141 if (so == NULL) {
3072 3142 SOCKPARAMS_DEC_REF(sp);
3073 3143 } else {
3074 3144 if ((*errorp = SOP_INIT(so, NULL, CRED(), SOCKET_SLEEP)) == 0) {
3075 3145 /* Cannot fail, only bumps so_count */
3076 3146 (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, CRED(), NULL);
3077 3147 } else {
3078 3148 socket_destroy(so);
3079 3149 so = NULL;
3080 3150 }
3081 3151 }
3082 3152 return (so);
3083 3153 }