11909 THREAD_KPRI_RELEASE does nothing of the sort
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
--- old/usr/src/uts/common/fs/ufs/ufs_directio.c
+++ new/usr/src/uts/common/fs/ufs/ufs_directio.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 23 * Use is subject to license terms.
24 + * Copyright 2019 Joyent, Inc.
24 25 */
25 26
26 27 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 28 /* All Rights Reserved */
28 29
29 30 /*
30 31 * Portions of this source code were derived from Berkeley 4.3 BSD
31 32 * under license from the Regents of the University of California.
32 33 */
33 34
34 35 #include <sys/types.h>
35 36 #include <sys/t_lock.h>
36 37 #include <sys/param.h>
37 38 #include <sys/time.h>
38 39 #include <sys/systm.h>
39 40 #include <sys/sysmacros.h>
40 41 #include <sys/resource.h>
41 42 #include <sys/signal.h>
42 43 #include <sys/cred.h>
43 44 #include <sys/user.h>
44 45 #include <sys/buf.h>
45 46 #include <sys/vfs.h>
46 47 #include <sys/vnode.h>
47 48 #include <sys/proc.h>
48 49 #include <sys/disp.h>
49 50 #include <sys/file.h>
50 51 #include <sys/fcntl.h>
51 52 #include <sys/flock.h>
52 53 #include <sys/kmem.h>
53 54 #include <sys/uio.h>
54 55 #include <sys/dnlc.h>
55 56 #include <sys/conf.h>
56 57 #include <sys/mman.h>
57 58 #include <sys/pathname.h>
58 59 #include <sys/debug.h>
59 60 #include <sys/vmsystm.h>
60 61 #include <sys/cmn_err.h>
61 62 #include <sys/filio.h>
62 63 #include <sys/atomic.h>
63 64
64 65 #include <sys/fssnap_if.h>
65 66 #include <sys/fs/ufs_fs.h>
66 67 #include <sys/fs/ufs_lockfs.h>
67 68 #include <sys/fs/ufs_filio.h>
68 69 #include <sys/fs/ufs_inode.h>
69 70 #include <sys/fs/ufs_fsdir.h>
70 71 #include <sys/fs/ufs_quota.h>
71 72 #include <sys/fs/ufs_trans.h>
72 73 #include <sys/fs/ufs_panic.h>
73 74 #include <sys/dirent.h> /* must be AFTER <sys/fs/fsdir.h>! */
74 75 #include <sys/errno.h>
75 76
76 77 #include <sys/filio.h> /* _FIOIO */
77 78
78 79 #include <vm/hat.h>
79 80 #include <vm/page.h>
80 81 #include <vm/pvn.h>
81 82 #include <vm/as.h>
82 83 #include <vm/seg.h>
83 84 #include <vm/seg_map.h>
84 85 #include <vm/seg_vn.h>
85 86 #include <vm/seg_kmem.h>
86 87 #include <vm/rm.h>
87 88 #include <sys/swap.h>
88 89 #include <sys/epm.h>
89 90
90 91 #include <fs/fs_subr.h>
91 92
92 93 static void *ufs_directio_zero_buf;
93 94 static int ufs_directio_zero_len = 8192;
94 95
95 96 int ufs_directio_enabled = 1; /* feature is enabled */
96 97
97 98 /*
98 99 * for kstats reader
99 100 */
100 101 struct ufs_directio_kstats {
101 102 kstat_named_t logical_reads;
102 103 kstat_named_t phys_reads;
103 104 kstat_named_t hole_reads;
104 105 kstat_named_t nread;
105 106 kstat_named_t logical_writes;
106 107 kstat_named_t phys_writes;
107 108 kstat_named_t nwritten;
108 109 kstat_named_t nflushes;
109 110 } ufs_directio_kstats = {
110 111 { "logical_reads", KSTAT_DATA_UINT64 },
111 112 { "phys_reads", KSTAT_DATA_UINT64 },
112 113 { "hole_reads", KSTAT_DATA_UINT64 },
113 114 { "nread", KSTAT_DATA_UINT64 },
114 115 { "logical_writes", KSTAT_DATA_UINT64 },
115 116 { "phys_writes", KSTAT_DATA_UINT64 },
116 117 { "nwritten", KSTAT_DATA_UINT64 },
117 118 { "nflushes", KSTAT_DATA_UINT64 },
118 119 };
119 120
120 121 kstat_t *ufs_directio_kstatsp;
121 122
122 123 /*
123 124 * use kmem_cache_create for direct-physio buffers. This has shown
124 125 * a better cache distribution compared to buffers on the
125 126  * stack. It also avoids semaphore construction/destruction
126 127 * per request
127 128 */
128 129 struct directio_buf {
129 130 struct directio_buf *next;
130 131 char *addr;
131 132 size_t nbytes;
132 133 struct buf buf;
133 134 };
134 135 static struct kmem_cache *directio_buf_cache;
135 136
136 137
137 138 /* ARGSUSED */
138 139 static int
139 140 directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
140 141 {
141 142 bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
142 143 return (0);
143 144 }
144 145
145 146 /* ARGSUSED */
146 147 static void
147 148 directio_buf_destructor(void *dbp, void *cdrarg)
148 149 {
149 150 biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
150 151 }
151 152
152 153 void
153 154 directio_bufs_init(void)
154 155 {
155 156 directio_buf_cache = kmem_cache_create("directio_buf_cache",
156 157 sizeof (struct directio_buf), 0,
157 158 directio_buf_constructor, directio_buf_destructor,
158 159 NULL, NULL, NULL, 0);
159 160 }
160 161
161 162 void
162 163 ufs_directio_init(void)
163 164 {
164 165 /*
165 166 * kstats
166 167 */
167 168 ufs_directio_kstatsp = kstat_create("ufs", 0,
168 169 "directio", "ufs", KSTAT_TYPE_NAMED,
169 170 sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
170 171 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
171 172 if (ufs_directio_kstatsp) {
172 173 ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
173 174 kstat_install(ufs_directio_kstatsp);
174 175 }
175 176 /*
176 177 * kzero is broken so we have to use a private buf of zeroes
177 178 */
178 179 ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
179 180 directio_bufs_init();
180 181 }
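
An aside for readers: the named kstats installed above are visible from userland, either with the kstat(1M) utility (kstat -m ufs -n directio) or programmatically through libkstat. A minimal sketch using the standard libkstat calls, compiled with -lkstat; error handling is trimmed and the choice of counter is illustrative:

	#include <kstat.h>
	#include <stdio.h>

	int
	main(void)
	{
		kstat_ctl_t *kc = kstat_open();
		kstat_t *ksp;
		kstat_named_t *kn;

		if (kc == NULL)
			return (1);
		/* module "ufs", instance 0, name "directio", as created above */
		if ((ksp = kstat_lookup(kc, "ufs", 0, "directio")) == NULL ||
		    kstat_read(kc, ksp, NULL) == -1) {
			(void) kstat_close(kc);
			return (1);
		}
		if ((kn = kstat_data_lookup(ksp, "nread")) != NULL)
			(void) printf("nread = %llu\n",
			    (unsigned long long)kn->value.ui64);
		(void) kstat_close(kc);
		return (0);
	}
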
181 182
182 183 /*
183 184 * Wait for the first direct IO operation to finish
184 185 */
185 186 static int
186 187 directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
187 188 {
188 189 buf_t *bp;
189 190 int error;
190 191
191 192 /*
192 193 * Wait for IO to finish
193 194 */
194 195 bp = &dbp->buf;
195 196 error = biowait(bp);
196 197
197 198 /*
198 199 * bytes_io will be used to figure out a resid
199 200 * for the caller. The resid is approximated by reporting
200 201 * the bytes following the first failed IO as the residual.
201 202 *
202 203 * I am cautious about using b_resid because I
203 204 * am not sure how well the disk drivers maintain it.
204 205 */
205 206 if (error)
206 207 if (bp->b_resid)
207 208 *bytes_iop = bp->b_bcount - bp->b_resid;
208 209 else
209 210 *bytes_iop = 0;
210 211 else
211 212 *bytes_iop += bp->b_bcount;
212 213 /*
213 214 * Release direct IO resources
214 215 */
215 216 bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
216 217 kmem_cache_free(directio_buf_cache, dbp);
217 218 return (error);
218 219 }
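
A concrete illustration of the approximation: suppose three 8192-byte requests were issued and the middle one failed with b_resid == 6144. The list is walked from the last request issued back to the first, so the walk first adds the last request's 8192 bytes, then the failing request overwrites the running count with its 8192 - 6144 == 2048 transferred bytes (discarding the later contribution), and finally the first request adds its full 8192 bytes, leaving *bytes_iop == 10240. That is everything up to and including the partial transfer, so the caller's computed residual begins, approximately, at the first byte that was not written.
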
219 220
220 221 /*
221 222 * Wait for all of the direct IO operations to finish
222 223 */
223 224
224 -uint32_t ufs_directio_drop_kpri = 0; /* enable kpri hack */
225 -
226 225 static int
227 226 directio_wait(struct directio_buf *tail, long *bytes_iop)
228 227 {
229 228 int error = 0, newerror;
230 229 struct directio_buf *dbp;
231 - uint_t kpri_req_save;
232 230
233 231 /*
234 232 * The linked list of directio buf structures is maintained
235 233 * in reverse order (tail->last request->penultimate request->...)
236 234 */
237 - /*
238 - * This is the k_pri_req hack. Large numbers of threads
239 - * sleeping with kernel priority will cause scheduler thrashing
240 - * on an MP machine. This can be seen running Oracle using
241 - * directio to ufs files. Sleep at normal priority here to
242 - * more closely mimic physio to a device partition. This
243 - * workaround is disabled by default as a niced thread could
244 - * be starved from running while holding i_rwlock and i_contents.
245 - */
246 - if (ufs_directio_drop_kpri) {
247 - kpri_req_save = curthread->t_kpri_req;
248 - curthread->t_kpri_req = 0;
249 - }
250 235 while ((dbp = tail) != NULL) {
251 236 tail = dbp->next;
252 237 newerror = directio_wait_one(dbp, bytes_iop);
253 238 if (error == 0)
254 239 error = newerror;
255 240 }
256 - if (ufs_directio_drop_kpri)
257 - curthread->t_kpri_req = kpri_req_save;
258 241 return (error);
259 242 }
260 243 /*
261 244 * Initiate direct IO request
262 245 */
263 246 static void
264 247 directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
265 - offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
266 - struct directio_buf **tailp, page_t **pplist)
248 + offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
249 + struct directio_buf **tailp, page_t **pplist)
267 250 {
268 251 buf_t *bp;
269 252 struct directio_buf *dbp;
270 253
271 254 /*
272 255 * Allocate a directio buf header
273 256 * Note - list is maintained in reverse order.
274 257 * directio_wait_one() depends on this fact when
275 258 * adjusting the ``bytes_io'' param. bytes_io
276 259 * is used to compute a residual in the case of error.
277 260 */
278 261 dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
279 262 dbp->next = *tailp;
280 263 *tailp = dbp;
281 264
282 265 /*
283 266 * Initialize buf header
284 267 */
285 268 dbp->addr = addr;
286 269 dbp->nbytes = nbytes;
287 270 bp = &dbp->buf;
288 271 bp->b_edev = ip->i_dev;
289 272 bp->b_lblkno = btodt(offset);
290 273 bp->b_bcount = nbytes;
291 274 bp->b_un.b_addr = addr;
292 275 bp->b_proc = procp;
293 276 bp->b_file = ip->i_vnode;
294 277
295 278 /*
296 279 * Note that S_WRITE implies B_READ and vice versa: a read(2)
297 280 * will B_READ data from the filesystem and S_WRITE it into
298 281 * the user's buffer; a write(2) will S_READ data from the
299 282 * user's buffer and B_WRITE it to the filesystem.
300 283 */
301 284 if (rw == S_WRITE) {
302 285 bp->b_flags = B_BUSY | B_PHYS | B_READ;
303 286 ufs_directio_kstats.phys_reads.value.ui64++;
304 287 ufs_directio_kstats.nread.value.ui64 += nbytes;
305 288 } else {
306 289 bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
307 290 ufs_directio_kstats.phys_writes.value.ui64++;
308 291 ufs_directio_kstats.nwritten.value.ui64 += nbytes;
309 292 }
310 293 bp->b_shadow = pplist;
311 294 if (pplist != NULL)
312 295 bp->b_flags |= B_SHADOW;
313 296
314 297 /*
315 298 * Issue I/O request.
316 299 */
317 300 ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
318 301 if (ufsvfsp->vfs_snapshot)
319 302 fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
320 303 else
321 304 (void) bdev_strategy(bp);
322 305
323 306 if (rw == S_WRITE)
324 307 lwp_stat_update(LWP_STAT_OUBLK, 1);
325 308 else
326 309 lwp_stat_update(LWP_STAT_INBLK, 1);
327 310
328 311 }
329 312
330 313 uint32_t ufs_shared_writes; /* writes done w/ lock shared */
331 314 uint32_t ufs_cur_writes; /* # concurrent writes */
332 315 uint32_t ufs_maxcur_writes; /* high water concurrent writes */
333 316 uint32_t ufs_posix_hits; /* writes done /w lock excl. */
334 317
335 318 /*
336 319  * Force POSIX synchronous data integrity on all writes for testing.
337 320 */
338 321 uint32_t ufs_force_posix_sdi = 0;
339 322
340 323 /*
341 324 * Direct Write
342 325 */
343 326
344 327 int
345 328 ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
346 - cred_t *cr, int *statusp)
329 + cred_t *cr, int *statusp)
347 330 {
348 331 long resid, bytes_written;
349 332 u_offset_t size, uoff;
350 333 uio_t *uio = arg_uio;
351 334 rlim64_t limit = uio->uio_llimit;
352 335 int on, n, error, newerror, len, has_holes;
353 336 daddr_t bn;
354 337 size_t nbytes;
355 338 struct fs *fs;
356 339 vnode_t *vp;
357 340 iovec_t *iov;
358 341 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
359 342 struct proc *procp;
360 343 struct as *as;
361 344 struct directio_buf *tail;
362 345 int exclusive, ncur, bmap_peek;
363 346 uio_t copy_uio;
364 347 iovec_t copy_iov;
365 348 char *copy_base;
366 349 long copy_resid;
367 350
368 351 /*
369 352 * assume that directio isn't possible (normal case)
370 353 */
371 354 *statusp = DIRECTIO_FAILURE;
372 355
373 356 /*
374 357 * Don't go direct
375 358 */
376 359 if (ufs_directio_enabled == 0)
377 360 return (0);
378 361
379 362 /*
380 363 * mapped file; nevermind
381 364 */
382 365 if (ip->i_mapcnt)
383 366 return (0);
384 367
385 368 /*
386 369 * CAN WE DO DIRECT IO?
387 370 */
388 371 uoff = uio->uio_loffset;
389 372 resid = uio->uio_resid;
390 373
391 374 /*
392 375 * beyond limit
393 376 */
394 377 if (uoff + resid > limit)
395 378 return (0);
396 379
397 380 /*
398 381 * must be sector aligned
399 382 */
400 383 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
401 384 return (0);
402 385
403 386 /*
404 387 * SHOULD WE DO DIRECT IO?
405 388 */
406 389 size = ip->i_size;
407 390 has_holes = -1;
408 391
409 392 /*
410 393 * only on regular files; no metadata
411 394 */
412 395 if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
413 396 return (0);
414 397
415 398 /*
416 399 * Synchronous, allocating writes run very slow in Direct-Mode
417 - * XXX - can be fixed with bmap_write changes for large writes!!!
400 + * XXX - can be fixed with bmap_write changes for large writes!!!
418 401 * XXX - can be fixed for updates to "almost-full" files
419 402 * XXX - WARNING - system hangs if bmap_write() has to
420 - * allocate lots of pages since pageout
421 - * suspends on locked inode
403 + * allocate lots of pages since pageout
404 + * suspends on locked inode
422 405 */
423 406 if (!rewrite && (ip->i_flag & ISYNC)) {
424 407 if ((uoff + resid) > size)
425 408 return (0);
426 409 has_holes = bmap_has_holes(ip);
427 410 if (has_holes)
428 411 return (0);
429 412 }
430 413
431 414 /*
432 415 * Each iovec must be short aligned and sector aligned. If
433 416 * one is not, then kmem_alloc a new buffer and copy all of
434 417 * the smaller buffers into the new buffer. This new
435 418 * buffer will be short aligned and sector aligned.
436 419 */
437 420 iov = uio->uio_iov;
438 421 nbytes = uio->uio_iovcnt;
439 422 while (nbytes--) {
440 423 if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
441 424 (intptr_t)(iov->iov_base) & 1) {
442 425 copy_resid = uio->uio_resid;
443 426 copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
444 427 if (copy_base == NULL)
445 428 return (0);
446 429 copy_iov.iov_base = copy_base;
447 430 copy_iov.iov_len = copy_resid;
448 431 copy_uio.uio_iov = &copy_iov;
449 432 copy_uio.uio_iovcnt = 1;
450 433 copy_uio.uio_segflg = UIO_SYSSPACE;
451 434 copy_uio.uio_extflg = UIO_COPY_DEFAULT;
452 435 copy_uio.uio_loffset = uio->uio_loffset;
453 436 copy_uio.uio_resid = uio->uio_resid;
454 437 copy_uio.uio_llimit = uio->uio_llimit;
455 438 error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
456 439 if (error) {
457 440 kmem_free(copy_base, copy_resid);
458 441 return (0);
459 442 }
460 443 uio = &copy_uio;
461 444 break;
462 445 }
463 446 iov++;
464 447 }
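
For example, a writev(2) supplying two 512-byte iovecs whose second base address is odd fails the short-alignment test above; the entire uio is then drained by uiomove() into a single kmem_alloc'd bounce buffer that is both short and sector aligned, and the write proceeds from that kernel copy (freed again at the bottom of this function).
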
465 448
466 449 /*
467 450 * From here on down, all error exits must go to errout and
468 451 * not simply return a 0.
469 452 */
470 453
471 454 /*
472 455 * DIRECTIO
473 456 */
474 457
475 458 fs = ip->i_fs;
476 459
477 460 /*
478 461 * POSIX check. If attempting a concurrent re-write, make sure
479 462 * that this will be a single request to the driver to meet
480 463 * POSIX synchronous data integrity requirements.
481 464 */
482 465 bmap_peek = 0;
483 466 if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
484 467 int upgrade = 0;
485 468
486 469 /* check easy conditions first */
487 470 if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
488 471 upgrade = 1;
489 472 } else {
490 473 /* now look for contiguous allocation */
491 474 len = (ssize_t)blkroundup(fs, resid);
492 475 error = bmap_read(ip, uoff, &bn, &len);
493 476 if (error || bn == UFS_HOLE || len == 0)
494 477 goto errout;
495 478 /* save a call to bmap_read later */
496 479 bmap_peek = 1;
497 480 if (len < resid)
498 481 upgrade = 1;
499 482 }
500 483 if (upgrade) {
501 484 rw_exit(&ip->i_contents);
502 485 rw_enter(&ip->i_contents, RW_WRITER);
503 486 ufs_posix_hits++;
504 487 }
505 488 }
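
In other words, a concurrent FDSYNC rewrite stays under the shared lock only when it is a single iovec no larger than vfs_ioclustsz and backed by one contiguous extent on disk; a multi-iovec, oversized, or discontiguous rewrite upgrades i_contents to writer so it is serialized instead of split across driver requests.
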
506 489
507 490
508 491 /*
509 492 * allocate space
510 493 */
511 494
512 495 /*
513 496 * If attempting a re-write, there is no allocation to do.
514 497 * bmap_write would trip an ASSERT if i_contents is held shared.
515 498 */
516 499 if (rewrite)
517 500 goto skip_alloc;
518 501
519 502 do {
520 503 on = (int)blkoff(fs, uoff);
521 504 n = (int)MIN(fs->fs_bsize - on, resid);
522 505 if ((uoff + n) > ip->i_size) {
523 506 error = bmap_write(ip, uoff, (int)(on + n),
524 507 (int)(uoff & (offset_t)MAXBOFFSET) == 0,
525 508 NULL, cr);
526 509 /* Caller is responsible for updating i_seq if needed */
527 510 if (error)
528 511 break;
529 512 ip->i_size = uoff + n;
530 513 ip->i_flag |= IATTCHG;
531 514 } else if (n == MAXBSIZE) {
532 515 error = bmap_write(ip, uoff, (int)(on + n),
533 516 BI_ALLOC_ONLY, NULL, cr);
534 517 /* Caller is responsible for updating i_seq if needed */
535 518 } else {
536 519 if (has_holes < 0)
537 520 has_holes = bmap_has_holes(ip);
538 521 if (has_holes) {
539 522 uint_t blk_size;
540 523 u_offset_t offset;
541 524
542 525 offset = uoff & (offset_t)fs->fs_bmask;
543 526 blk_size = (int)blksize(fs, ip,
544 527 (daddr_t)lblkno(fs, offset));
545 528 error = bmap_write(ip, uoff, blk_size,
546 529 BI_NORMAL, NULL, cr);
547 530 /*
548 531 * Caller is responsible for updating
549 532 * i_seq if needed
550 533 */
551 534 } else
552 535 error = 0;
553 536 }
554 537 if (error)
555 538 break;
556 539 uoff += n;
557 540 resid -= n;
558 541 /*
559 542 * if file has grown larger than 2GB, set flag
560 543 * in superblock if not already set
561 544 */
562 545 if ((ip->i_size > MAXOFF32_T) &&
563 546 !(fs->fs_flags & FSLARGEFILES)) {
564 547 ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
565 548 mutex_enter(&ufsvfsp->vfs_lock);
566 549 fs->fs_flags |= FSLARGEFILES;
567 550 ufs_sbwrite(ufsvfsp);
568 551 mutex_exit(&ufsvfsp->vfs_lock);
569 552 }
570 553 } while (resid);
571 554
572 555 if (error) {
573 556 /*
574 557 * restore original state
575 558 */
576 559 if (resid) {
577 560 if (size == ip->i_size)
578 561 goto errout;
579 562 (void) ufs_itrunc(ip, size, 0, cr);
580 563 }
581 564 /*
582 565 * try non-directio path
583 566 */
584 567 goto errout;
585 568 }
586 569 skip_alloc:
587 570
588 571 /*
589 572 * get rid of cached pages
590 573 */
591 574 vp = ITOV(ip);
592 575 exclusive = rw_write_held(&ip->i_contents);
593 576 if (vn_has_cached_data(vp)) {
594 577 if (!exclusive) {
595 578 /*
596 579 * Still holding i_rwlock, so no allocations
597 580 * can happen after dropping contents.
598 581 */
599 582 rw_exit(&ip->i_contents);
600 583 rw_enter(&ip->i_contents, RW_WRITER);
601 584 }
602 585 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
603 586 B_INVAL, cr, NULL);
604 587 if (vn_has_cached_data(vp))
605 588 goto errout;
606 589 if (!exclusive)
607 590 rw_downgrade(&ip->i_contents);
608 591 ufs_directio_kstats.nflushes.value.ui64++;
609 592 }
610 593
611 594 /*
612 595 * Direct Writes
613 596 */
614 597
615 598 if (!exclusive) {
616 599 ufs_shared_writes++;
617 600 ncur = atomic_inc_32_nv(&ufs_cur_writes);
618 601 if (ncur > ufs_maxcur_writes)
619 602 ufs_maxcur_writes = ncur;
620 603 }
621 604
622 605 /*
623 606 * proc and as are for VM operations in directio_start()
624 607 */
625 608 if (uio->uio_segflg == UIO_USERSPACE) {
626 609 procp = ttoproc(curthread);
627 610 as = procp->p_as;
628 611 } else {
629 612 procp = NULL;
630 613 as = &kas;
631 614 }
632 615 *statusp = DIRECTIO_SUCCESS;
633 616 error = 0;
634 617 newerror = 0;
635 618 resid = uio->uio_resid;
636 619 bytes_written = 0;
637 620 ufs_directio_kstats.logical_writes.value.ui64++;
638 621 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
639 622 size_t pglck_len, pglck_size;
640 623 caddr_t pglck_base;
641 624 page_t **pplist, **spplist;
642 625
643 626 tail = NULL;
644 627
645 628 /*
646 629 * Adjust number of bytes
647 630 */
648 631 iov = uio->uio_iov;
649 632 pglck_len = (size_t)MIN(iov->iov_len, resid);
650 633 pglck_base = iov->iov_base;
651 634 if (pglck_len == 0) {
652 635 uio->uio_iov++;
653 636 uio->uio_iovcnt--;
654 637 continue;
655 638 }
656 639
657 640 /*
658 641  * Try to lock down the largest chunk of pages possible.
659 642 */
660 643 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
661 644 error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
662 645
663 646 if (error)
664 647 break;
665 648
666 649 pglck_size = pglck_len;
667 650 while (pglck_len) {
668 651
669 652 nbytes = pglck_len;
670 653 uoff = uio->uio_loffset;
671 654
672 655 if (!bmap_peek) {
673 656
674 657 /*
675 658 * Re-adjust number of bytes to contiguous
676 659 * range. May have already called bmap_read
677 660 * in the case of a concurrent rewrite.
678 661 */
679 662 len = (ssize_t)blkroundup(fs, nbytes);
680 663 error = bmap_read(ip, uoff, &bn, &len);
681 664 if (error)
682 665 break;
683 666 if (bn == UFS_HOLE || len == 0)
684 667 break;
685 668 }
686 669 nbytes = (size_t)MIN(nbytes, len);
687 670 bmap_peek = 0;
688 671
689 672 /*
690 673 * Get the pagelist pointer for this offset to be
691 674 * passed to directio_start.
692 675 */
693 676
694 677 if (pplist != NULL)
695 678 spplist = pplist +
696 679 btop((uintptr_t)iov->iov_base -
697 680 ((uintptr_t)pglck_base & PAGEMASK));
698 681 else
699 682 spplist = NULL;
700 683
701 684 /*
702 685 * Kick off the direct write requests
703 686 */
704 687 directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
705 688 iov->iov_base, S_READ, procp, &tail, spplist);
706 689
707 690 /*
708 691 * Adjust pointers and counters
709 692 */
710 693 iov->iov_len -= nbytes;
711 694 iov->iov_base += nbytes;
712 695 uio->uio_loffset += nbytes;
713 696 resid -= nbytes;
714 697 pglck_len -= nbytes;
715 698 }
716 699
717 700 /*
718 701 * Wait for outstanding requests
719 702 */
720 703 newerror = directio_wait(tail, &bytes_written);
721 704
722 705 /*
723 706 * Release VM resources
724 707 */
725 708 as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
726 709
727 710 }
728 711
729 712 if (!exclusive) {
730 713 atomic_dec_32(&ufs_cur_writes);
731 714 /*
732 715 * If this write was done shared, readers may
733 716 * have pulled in unmodified pages. Get rid of
734 717 * these potentially stale pages.
735 718 */
736 719 if (vn_has_cached_data(vp)) {
737 720 rw_exit(&ip->i_contents);
738 721 rw_enter(&ip->i_contents, RW_WRITER);
739 722 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
740 723 B_INVAL, cr, NULL);
741 724 ufs_directio_kstats.nflushes.value.ui64++;
742 725 rw_downgrade(&ip->i_contents);
743 726 }
744 727 }
745 728
746 729 /*
747 730 * If error, adjust resid to begin at the first
748 731 * un-writable byte.
749 732 */
750 733 if (error == 0)
751 734 error = newerror;
752 735 if (error)
753 736 resid = uio->uio_resid - bytes_written;
754 737 arg_uio->uio_resid = resid;
755 738
756 739 if (!rewrite) {
757 740 ip->i_flag |= IUPD | ICHG;
758 741 /* Caller will update i_seq */
759 742 TRANS_INODE(ip->i_ufsvfs, ip);
760 743 }
761 744 /*
762 745 * If there is a residual; adjust the EOF if necessary
763 746 */
764 747 if (resid) {
765 748 if (size != ip->i_size) {
766 749 if (uio->uio_loffset > size)
767 750 size = uio->uio_loffset;
768 751 (void) ufs_itrunc(ip, size, 0, cr);
769 752 }
770 753 }
771 754
772 755 if (uio == &copy_uio)
773 756 kmem_free(copy_base, copy_resid);
774 757
775 758 return (error);
776 759
777 760 errout:
778 761 if (uio == &copy_uio)
779 762 kmem_free(copy_base, copy_resid);
780 763
781 764 return (0);
782 765 }
783 766 /*
784 767 * Direct read of a hole
785 768 */
786 769 static int
787 770 directio_hole(struct uio *uio, size_t nbytes)
788 771 {
789 772 int error = 0, nzero;
790 773 uio_t phys_uio;
791 774 iovec_t phys_iov;
792 775
793 776 ufs_directio_kstats.hole_reads.value.ui64++;
794 777 ufs_directio_kstats.nread.value.ui64 += nbytes;
795 778
796 779 phys_iov.iov_base = uio->uio_iov->iov_base;
797 780 phys_iov.iov_len = nbytes;
798 781
799 782 phys_uio.uio_iov = &phys_iov;
800 783 phys_uio.uio_iovcnt = 1;
801 784 phys_uio.uio_resid = phys_iov.iov_len;
802 785 phys_uio.uio_segflg = uio->uio_segflg;
803 786 phys_uio.uio_extflg = uio->uio_extflg;
804 787 while (error == 0 && phys_uio.uio_resid) {
805 788 nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
806 789 error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
807 790 &phys_uio);
808 791 }
809 792 return (error);
810 793 }
811 794
812 795 /*
813 796 * Direct Read
814 797 */
815 798 int
816 799 ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
817 800 {
818 801 ssize_t resid, bytes_read;
819 802 u_offset_t size, uoff;
820 803 int error, newerror, len;
821 804 size_t nbytes;
822 805 struct fs *fs;
823 806 vnode_t *vp;
824 807 daddr_t bn;
825 808 iovec_t *iov;
826 809 struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
827 810 struct proc *procp;
828 811 struct as *as;
829 812 struct directio_buf *tail;
830 813
831 814 /*
832 815 * assume that directio isn't possible (normal case)
833 816 */
834 817 *statusp = DIRECTIO_FAILURE;
835 818
836 819 /*
837 820 * Don't go direct
838 821 */
839 822 if (ufs_directio_enabled == 0)
840 823 return (0);
841 824
842 825 /*
843 826 * mapped file; nevermind
844 827 */
845 828 if (ip->i_mapcnt)
846 829 return (0);
847 830
848 831 /*
849 832 * CAN WE DO DIRECT IO?
850 833 */
851 834 /*
852 835 * must be sector aligned
853 836 */
854 837 uoff = uio->uio_loffset;
855 838 resid = uio->uio_resid;
856 839 if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
857 840 return (0);
858 841 /*
859 842 * must be short aligned and sector aligned
860 843 */
861 844 iov = uio->uio_iov;
862 845 nbytes = uio->uio_iovcnt;
863 846 while (nbytes--) {
864 847 if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
865 848 return (0);
866 849 if ((intptr_t)(iov++->iov_base) & 1)
867 850 return (0);
868 851 }
869 852
870 853 /*
871 854 * DIRECTIO
872 855 */
873 856 fs = ip->i_fs;
874 857
875 858 /*
876 859 * don't read past EOF
877 860 */
878 861 size = ip->i_size;
879 862
880 863 /*
881 864 * The file offset is past EOF so bail out here; we don't want
882 865 * to update uio_resid and make it look like we read something.
883 866 * We say that direct I/O was a success to avoid having rdip()
884 867 * go through the same "read past EOF logic".
885 868 */
886 869 if (uoff >= size) {
887 870 *statusp = DIRECTIO_SUCCESS;
888 871 return (0);
889 872 }
890 873
891 874 /*
892 875 * The read would extend past EOF so make it smaller.
893 876 */
894 877 if ((uoff + resid) > size) {
895 878 resid = size - uoff;
896 879 /*
897 880 * recheck sector alignment
898 881 */
899 882 if (resid & (DEV_BSIZE - 1))
900 883 return (0);
901 884 }
902 885
903 886 /*
904 887 * At this point, we know there is some real work to do.
905 888 */
906 889 ASSERT(resid);
907 890
908 891 /*
909 892 * get rid of cached pages
910 893 */
911 894 vp = ITOV(ip);
912 895 if (vn_has_cached_data(vp)) {
913 896 rw_exit(&ip->i_contents);
914 897 rw_enter(&ip->i_contents, RW_WRITER);
915 898 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
916 899 B_INVAL, cr, NULL);
917 900 if (vn_has_cached_data(vp))
918 901 return (0);
919 902 rw_downgrade(&ip->i_contents);
920 903 ufs_directio_kstats.nflushes.value.ui64++;
921 904 }
922 905 /*
923 906 * Direct Reads
924 907 */
925 908
926 909 /*
927 910 * proc and as are for VM operations in directio_start()
928 911 */
929 912 if (uio->uio_segflg == UIO_USERSPACE) {
930 913 procp = ttoproc(curthread);
931 914 as = procp->p_as;
932 915 } else {
933 916 procp = NULL;
934 917 as = &kas;
935 918 }
936 919
937 920 *statusp = DIRECTIO_SUCCESS;
938 921 error = 0;
939 922 newerror = 0;
940 923 bytes_read = 0;
941 924 ufs_directio_kstats.logical_reads.value.ui64++;
942 925 while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
943 926 size_t pglck_len, pglck_size;
944 927 caddr_t pglck_base;
945 928 page_t **pplist, **spplist;
946 929
947 930 tail = NULL;
948 931
949 932 /*
950 933 * Adjust number of bytes
951 934 */
952 935 iov = uio->uio_iov;
953 936 pglck_len = (size_t)MIN(iov->iov_len, resid);
954 937 pglck_base = iov->iov_base;
955 938 if (pglck_len == 0) {
956 939 uio->uio_iov++;
957 940 uio->uio_iovcnt--;
958 941 continue;
959 942 }
960 943
961 944 /*
962 945  * Try to lock down the largest chunk of pages possible.
963 946 */
964 947 pglck_len = (size_t)MIN(pglck_len, ufsvfsp->vfs_ioclustsz);
965 948 error = as_pagelock(as, &pplist, pglck_base,
966 949 pglck_len, S_WRITE);
967 950
968 951 if (error)
969 952 break;
970 953
971 954 pglck_size = pglck_len;
972 955 while (pglck_len) {
973 956
974 957 nbytes = pglck_len;
975 958 uoff = uio->uio_loffset;
976 959
977 960 /*
978 961 * Re-adjust number of bytes to contiguous range
979 962 */
980 963 len = (ssize_t)blkroundup(fs, nbytes);
981 964 error = bmap_read(ip, uoff, &bn, &len);
982 965 if (error)
983 966 break;
984 967
985 968 if (bn == UFS_HOLE) {
986 969 nbytes = (size_t)MIN(fs->fs_bsize -
987 970 (long)blkoff(fs, uoff), nbytes);
988 971 error = directio_hole(uio, nbytes);
989 972 /*
990 973 * Hole reads are not added to the list
991 974 * processed by directio_wait() below so
992 975 * account for bytes read here.
993 976 */
994 977 if (!error)
995 978 bytes_read += nbytes;
996 979 } else {
997 980 nbytes = (size_t)MIN(nbytes, len);
998 981
999 982 /*
1000 983 * Get the pagelist pointer for this offset
1001 984 * to be passed to directio_start.
1002 985 */
1003 986 if (pplist != NULL)
1004 987 spplist = pplist +
1005 988 btop((uintptr_t)iov->iov_base -
1006 989 ((uintptr_t)pglck_base & PAGEMASK));
1007 990 else
1008 991 spplist = NULL;
1009 992
1010 993 /*
1011 994 * Kick off the direct read requests
1012 995 */
1013 996 directio_start(ufsvfsp, ip, nbytes,
1014 997 ldbtob(bn), iov->iov_base,
1015 998 S_WRITE, procp, &tail, spplist);
1016 999 }
1017 1000
1018 1001 if (error)
1019 1002 break;
1020 1003
1021 1004 /*
1022 1005 * Adjust pointers and counters
1023 1006 */
1024 1007 iov->iov_len -= nbytes;
1025 1008 iov->iov_base += nbytes;
1026 1009 uio->uio_loffset += nbytes;
1027 1010 resid -= nbytes;
1028 1011 pglck_len -= nbytes;
1029 1012 }
1030 1013
1031 1014 /*
1032 1015 * Wait for outstanding requests
1033 1016 */
1034 1017 newerror = directio_wait(tail, &bytes_read);
1035 1018 /*
1036 1019 * Release VM resources
1037 1020 */
1038 1021 as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);
1039 1022
1040 1023 }
1041 1024
1042 1025 /*
1043 1026 * If error, adjust resid to begin at the first
1044 1027 * un-read byte.
1045 1028 */
1046 1029 if (error == 0)
1047 1030 error = newerror;
1048 1031 uio->uio_resid -= bytes_read;
1049 1032 return (error);
1050 1033 }
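
Closing context: both entry points above insist on a DEV_BSIZE-aligned offset and length and short-aligned buffer addresses. A hedged userland sketch of exercising this path through directio(3C), the documented interface for requesting direct I/O on UFS; the sizes and the lack of error reporting are illustrative only:

	#include <sys/types.h>
	#include <sys/fcntl.h>	/* directio(), DIRECTIO_ON */
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int
	main(int argc, char **argv)
	{
		int fd;
		void *buf;
		ssize_t n;

		if (argc != 2)
			return (1);
		if ((fd = open(argv[1], O_RDONLY)) == -1)
			return (1);
		/*
		 * A page-aligned 8K buffer satisfies both the sector
		 * (DEV_BSIZE) alignment of length/offset and the short
		 * alignment of the buffer address checked above.
		 */
		if ((buf = memalign(8192, 8192)) == NULL)
			return (1);
		if (directio(fd, DIRECTIO_ON) == -1)	/* bypass the page cache */
			return (1);
		n = pread(fd, buf, 8192, 0);	/* offset 0 is sector aligned */
		(void) printf("direct read returned %ld\n", (long)n);
		free(buf);
		(void) close(fd);
		return (0);
	}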