Print this page
    
7656 unlinking directory on tmpfs can cause kernel panic
    
      
        | Split | Close | 
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/tmpfs/tmp_dir.c
          +++ new/usr/src/uts/common/fs/tmpfs/tmp_dir.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | ↓ open down ↓ | 13 lines elided | ↑ open up ↑ | 
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  23   23   * Use is subject to license terms.
       24 + *
       25 + * Copyright 2016 RackTop Systems.
  24   26   */
  25   27  
  26      -#pragma ident   "%Z%%M% %I%     %E% SMI"
  27      -
  28   28  #include <sys/types.h>
  29   29  #include <sys/param.h>
  30   30  #include <sys/sysmacros.h>
  31   31  #include <sys/systm.h>
  32   32  #include <sys/time.h>
  33   33  #include <sys/vfs.h>
  34   34  #include <sys/vnode.h>
  35   35  #include <sys/errno.h>
  36   36  #include <sys/cmn_err.h>
  37   37  #include <sys/cred.h>
  38   38  #include <sys/stat.h>
  39   39  #include <sys/debug.h>
  40   40  #include <sys/policy.h>
  41   41  #include <sys/fs/tmpnode.h>
  42   42  #include <sys/fs/tmp.h>
  43   43  #include <sys/vtrace.h>
  44   44  
  45   45  static int tdircheckpath(struct tmpnode *, struct tmpnode *, struct cred *);
  46   46  static int tdirrename(struct tmpnode *, struct tmpnode *, struct tmpnode *,
  47   47          char *, struct tmpnode *, struct tdirent *, struct cred *);
  48   48  static void tdirfixdotdot(struct tmpnode *, struct tmpnode *, struct tmpnode *);
  49   49  static int tdirmaketnode(struct tmpnode *, struct tmount *, struct vattr *,
  50   50          enum de_op, struct tmpnode **, struct cred *);
  51   51  static int tdiraddentry(struct tmpnode *, struct tmpnode *, char *,
  52   52          enum de_op, struct tmpnode *);
  53   53  
  54   54  
  55   55  #define T_HASH_SIZE     8192            /* must be power of 2 */
  56   56  #define T_MUTEX_SIZE    64
  57   57  
  58   58  static struct tdirent   *t_hashtable[T_HASH_SIZE];
  59   59  static kmutex_t          t_hashmutex[T_MUTEX_SIZE];
  60   60  
  61   61  #define T_HASH_INDEX(a)         ((a) & (T_HASH_SIZE-1))
  62   62  #define T_MUTEX_INDEX(a)        ((a) & (T_MUTEX_SIZE-1))
  63   63  
  64   64  #define TMPFS_HASH(tp, name, hash)                              \
  65   65          {                                                       \
  66   66                  char Xc, *Xcp;                                  \
  67   67                  hash = (uint_t)(uintptr_t)(tp) >> 8;            \
  68   68                  for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)     \
  69   69                          hash = (hash << 4) + hash + (uint_t)Xc; \
  70   70          }
  71   71  
  72   72  void
  73   73  tmpfs_hash_init(void)
  74   74  {
  75   75          int     ix;
  76   76  
  77   77          for (ix = 0; ix < T_MUTEX_SIZE; ix++)
  78   78                  mutex_init(&t_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
  79   79  }
  80   80  
  81   81  /*
  82   82   * Hash tdirent *t on (parent, name) and insert it at the head of its chain.
  83   83   */
  84   84  static void
  85   85  tmpfs_hash_in(struct tdirent *t)
  86   86  {
  87   87          uint_t          hash;
  88   88          struct tdirent  **prevpp;
  89   89          kmutex_t        *t_hmtx;
  90   90  
  91   91          TMPFS_HASH(t->td_parent, t->td_name, hash);
  92   92          t->td_hash = hash;
  93   93          prevpp = &t_hashtable[T_HASH_INDEX(hash)];
  94   94          t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
  95   95          mutex_enter(t_hmtx);
  96   96          t->td_link = *prevpp;
  97   97          *prevpp = t;
  98   98          mutex_exit(t_hmtx);
  99   99  }
 100  100  
 101  101  /*
 102  102   * Remove tdirent *t from the hash list.
 103  103   */
 104  104  static void
 105  105  tmpfs_hash_out(struct tdirent *t)
 106  106  {
 107  107          uint_t          hash;
 108  108          struct tdirent  **prevpp;
 109  109          kmutex_t        *t_hmtx;
 110  110  
 111  111          hash = t->td_hash;
 112  112          prevpp = &t_hashtable[T_HASH_INDEX(hash)];
 113  113          t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
 114  114          mutex_enter(t_hmtx);
 115  115          while (*prevpp != t)
 116  116                  prevpp = &(*prevpp)->td_link;
 117  117          *prevpp = t->td_link;
 118  118          mutex_exit(t_hmtx);
 119  119  }
 120  120  
 121  121  /*
 122  122   * Currently called by tdirrename() only.
 123  123   * rename operation needs to be done with lock held, to ensure that
 124  124   * no other operations can access the tmpnode at the same time.
 125  125   */
 126  126  static void
 127  127  tmpfs_hash_change(struct tdirent *tdp, struct tmpnode *fromtp)
 128  128  {
 129  129          uint_t          hash;
 130  130          kmutex_t        *t_hmtx;
 131  131  
 132  132          hash = tdp->td_hash;
 133  133          t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
 134  134          mutex_enter(t_hmtx);
 135  135          tdp->td_tmpnode = fromtp;
 136  136          mutex_exit(t_hmtx);
 137  137  }
 138  138  
 139  139  static struct tdirent *
 140  140  tmpfs_hash_lookup(char *name, struct tmpnode *parent, uint_t hold,
 141  141          struct tmpnode **found)
 142  142  {
 143  143          struct tdirent  *l;
 144  144          uint_t          hash;
 145  145          kmutex_t        *t_hmtx;
 146  146          struct tmpnode  *tnp;
 147  147  
 148  148          TMPFS_HASH(parent, name, hash);
 149  149          t_hmtx = &t_hashmutex[T_MUTEX_INDEX(hash)];
 150  150          mutex_enter(t_hmtx);
 151  151          l = t_hashtable[T_HASH_INDEX(hash)];
 152  152          while (l) {
 153  153                  if ((l->td_hash == hash) &&
 154  154                      (l->td_parent == parent) &&
 155  155                      (strcmp(l->td_name, name) == 0)) {
 156  156                          /*
 157  157                           * We need to make sure that the tmpnode that
 158  158                           * we put a hold on is the same one that we pass back.
 159  159                           * Hence, temporary variable tnp is necessary.
 160  160                           */
 161  161                          tnp = l->td_tmpnode;
 162  162                          if (hold) {
 163  163                                  ASSERT(tnp);
 164  164                                  tmpnode_hold(tnp);
 165  165                          }
 166  166                          if (found)
 167  167                                  *found = tnp;
 168  168                          mutex_exit(t_hmtx);
 169  169                          return (l);
 170  170                  } else {
 171  171                          l = l->td_link;
 172  172                  }
 173  173          }
 174  174          mutex_exit(t_hmtx);
 175  175          return (NULL);
 176  176  }
 177  177  
 178  178  /*
 179  179   * Search directory 'parent' for entry 'name'.
 180  180   *
 181  181   * The calling thread can't hold the write version
 182  182   * of the rwlock for the directory being searched
 183  183   *
 184  184   * 0 is returned on success and *foundtp points
 185  185   * to the found tmpnode with its vnode held.
 186  186   */
 187  187  int
 188  188  tdirlookup(
 189  189          struct tmpnode *parent,
 190  190          char *name,
 191  191          struct tmpnode **foundtp,
 192  192          struct cred *cred)
 193  193  {
 194  194          int error;
 195  195  
 196  196          *foundtp = NULL;
 197  197          if (parent->tn_type != VDIR)
 198  198                  return (ENOTDIR);
 199  199  
 200  200          if ((error = tmp_taccess(parent, VEXEC, cred)))
 201  201                  return (error);
 202  202  
 203  203          if (*name == '\0') {
 204  204                  tmpnode_hold(parent);
 205  205                  *foundtp = parent;
 206  206                  return (0);
 207  207          }
 208  208  
 209  209          /*
 210  210           * Search the directory for the matching name
 211  211           * We need the lock protecting the tn_dir list
 212  212           * so that it doesn't change out from underneath us.
 213  213           * tmpfs_hash_lookup() will pass back the tmpnode
 214  214           * with a hold on it.
 215  215           */
 216  216  
 217  217          if (tmpfs_hash_lookup(name, parent, 1, foundtp) != NULL) {
 218  218                  ASSERT(*foundtp);
 219  219                  return (0);
 220  220          }
 221  221  
 222  222          return (ENOENT);
 223  223  }
 224  224  
 225  225  /*
 226  226   * Enter a directory entry for 'name' and 'tp' into directory 'dir'
 227  227   *
 228  228   * Returns 0 on success.
 229  229   */
 230  230  int
 231  231  tdirenter(
 232  232          struct tmount   *tm,
 233  233          struct tmpnode  *dir,           /* target directory to make entry in */
 234  234          char            *name,          /* name of entry */
 235  235          enum de_op      op,             /* entry operation */
 236  236          struct tmpnode  *fromparent,    /* source directory if rename */
 237  237          struct tmpnode  *tp,            /* source tmpnode, if link/rename */
 238  238          struct vattr    *va,
 239  239          struct tmpnode  **tpp,          /* return tmpnode, if create/mkdir */
 240  240          struct cred     *cred,
 241  241          caller_context_t *ctp)
 242  242  {
 243  243          struct tdirent *tdp;
 244  244          struct tmpnode *found = NULL;
 245  245          int error = 0;
 246  246          char *s;
 247  247  
 248  248          /*
 249  249           * tn_rwlock is held to serialize direnter and dirdeletes
 250  250           */
 251  251          ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
 252  252          ASSERT(dir->tn_type == VDIR);
 253  253  
 254  254          /*
 255  255           * Don't allow '/' characters in pathname component
 256  256           * (thus in ufs_direnter()).
 257  257           */
 258  258          for (s = name; *s; s++)
 259  259                  if (*s == '/')
 260  260                          return (EACCES);
 261  261  
 262  262          if (name[0] == '\0')
 263  263                  panic("tdirenter: NULL name");
 264  264  
 265  265          /*
 266  266           * For link and rename lock the source entry and check the link count
 267  267           * to see if it has been removed while it was unlocked.
 268  268           */
 269  269          if (op == DE_LINK || op == DE_RENAME) {
 270  270                  if (tp != dir)
 271  271                          rw_enter(&tp->tn_rwlock, RW_WRITER);
 272  272                  mutex_enter(&tp->tn_tlock);
 273  273                  if (tp->tn_nlink == 0) {
 274  274                          mutex_exit(&tp->tn_tlock);
 275  275                          if (tp != dir)
 276  276                                  rw_exit(&tp->tn_rwlock);
 277  277                          return (ENOENT);
 278  278                  }
 279  279  
 280  280                  if (tp->tn_nlink == MAXLINK) {
 281  281                          mutex_exit(&tp->tn_tlock);
 282  282                          if (tp != dir)
 283  283                                  rw_exit(&tp->tn_rwlock);
 284  284                          return (EMLINK);
 285  285                  }
 286  286                  tp->tn_nlink++;
 287  287                  gethrestime(&tp->tn_ctime);
 288  288                  mutex_exit(&tp->tn_tlock);
 289  289                  if (tp != dir)
 290  290                          rw_exit(&tp->tn_rwlock);
 291  291          }
 292  292  
 293  293          /*
 294  294           * This might be a "dangling detached directory".
 295  295           * it could have been removed, but a reference
 296  296           * to it kept in u_cwd.  don't bother searching
 297  297           * it, and with any luck the user will get tired
 298  298           * of dealing with us and cd to some absolute
 299  299           * pathway.  *sigh*, thus in ufs, too.
 300  300           */
 301  301          if (dir->tn_nlink == 0) {
 302  302                  error = ENOENT;
 303  303                  goto out;
 304  304          }
 305  305  
 306  306          /*
 307  307           * If this is a rename of a directory and the parent is
 308  308           * different (".." must be changed), then the source
 309  309           * directory must not be in the directory hierarchy
 310  310           * above the target, as this would orphan everything
 311  311           * below the source directory.
 312  312           */
 313  313          if (op == DE_RENAME) {
 314  314                  if (tp == dir) {
 315  315                          error = EINVAL;
 316  316                          goto out;
 317  317                  }
 318  318                  if (tp->tn_type == VDIR) {
 319  319                          if ((fromparent != dir) &&
 320  320                              (error = tdircheckpath(tp, dir, cred))) {
 321  321                                  goto out;
 322  322                          }
 323  323                  }
 324  324          }
 325  325  
 326  326          /*
 327  327           * Search for the entry.  Return "found" if it exists.
 328  328           */
 329  329          tdp = tmpfs_hash_lookup(name, dir, 1, &found);
 330  330  
 331  331          if (tdp) {
 332  332                  ASSERT(found);
 333  333                  switch (op) {
 334  334                  case DE_CREATE:
 335  335                  case DE_MKDIR:
 336  336                          if (tpp) {
 337  337                                  *tpp = found;
 338  338                                  error = EEXIST;
 339  339                          } else {
 340  340                                  tmpnode_rele(found);
 341  341                          }
 342  342                          break;
 343  343  
 344  344                  case DE_RENAME:
 345  345                          error = tdirrename(fromparent, tp,
 346  346                              dir, name, found, tdp, cred);
 347  347                          if (error == 0) {
 348  348                                  if (found != NULL) {
 349  349                                          vnevent_rename_dest(TNTOV(found),
 350  350                                              TNTOV(dir), name, ctp);
 351  351                                  }
 352  352                          }
 353  353  
 354  354                          tmpnode_rele(found);
 355  355                          break;
 356  356  
 357  357                  case DE_LINK:
 358  358                          /*
 359  359                           * Can't link to an existing file.
 360  360                           */
 361  361                          error = EEXIST;
 362  362                          tmpnode_rele(found);
 363  363                          break;
 364  364                  }
 365  365          } else {
 366  366  
 367  367                  /*
 368  368                   * The entry does not exist. Check write permission in
 369  369                   * directory to see if entry can be created.
 370  370                   */
 371  371                  if (error = tmp_taccess(dir, VWRITE, cred))
 372  372                          goto out;
 373  373                  if (op == DE_CREATE || op == DE_MKDIR) {
 374  374                          /*
 375  375                           * Make new tmpnode and directory entry as required.
 376  376                           */
 377  377                          error = tdirmaketnode(dir, tm, va, op, &tp, cred);
 378  378                          if (error)
 379  379                                  goto out;
 380  380                  }
 381  381                  if (error = tdiraddentry(dir, tp, name, op, fromparent)) {
 382  382                          if (op == DE_CREATE || op == DE_MKDIR) {
 383  383                                  /*
 384  384                                   * Unmake the inode we just made.
 385  385                                   */
 386  386                                  rw_enter(&tp->tn_rwlock, RW_WRITER);
 387  387                                  if ((tp->tn_type) == VDIR) {
 388  388                                          ASSERT(tdp == NULL);
 389  389                                          /*
 390  390                                           * cleanup allocs made by tdirinit()
 391  391                                           */
 392  392                                          tdirtrunc(tp);
 393  393                                  }
 394  394                                  mutex_enter(&tp->tn_tlock);
 395  395                                  tp->tn_nlink = 0;
 396  396                                  mutex_exit(&tp->tn_tlock);
 397  397                                  gethrestime(&tp->tn_ctime);
 398  398                                  rw_exit(&tp->tn_rwlock);
 399  399                                  tmpnode_rele(tp);
 400  400                                  tp = NULL;
 401  401                          }
 402  402                  } else if (tpp) {
 403  403                          *tpp = tp;
 404  404                  } else if (op == DE_CREATE || op == DE_MKDIR) {
 405  405                          tmpnode_rele(tp);
 406  406                  }
 407  407          }
 408  408  
 409  409  out:
 410  410          if (error && (op == DE_LINK || op == DE_RENAME)) {
 411  411                  /*
 412  412                   * Undo bumped link count.
 413  413                   */
 414  414                  DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
 415  415                  gethrestime(&tp->tn_ctime);
 416  416          }
 417  417          return (error);
 418  418  }
 419  419  
 420  420  /*
 421  421   * Delete entry tp of name "nm" from dir.
 422  422   * Free dir entry space and decrement link count on tmpnode(s).
 423  423   *
 424  424   * Return 0 on success.
 425  425   */
 426  426  int
 427  427  tdirdelete(
 428  428          struct tmpnode *dir,
 429  429          struct tmpnode *tp,
 430  430          char *nm,
 431  431          enum dr_op op,
 432  432          struct cred *cred)
 433  433  {
 434  434          struct tdirent *tpdp;
 435  435          int error;
 436  436          size_t namelen;
 437  437          struct tmpnode *tnp;
 438  438          timestruc_t now;
 439  439  
 440  440          ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
 441  441          ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
 442  442          ASSERT(dir->tn_type == VDIR);
 443  443  
 444  444          if (nm[0] == '\0')
 445  445                  panic("tdirdelete: NULL name for %p", (void *)tp);
 446  446  
 447  447          /*
 448  448           * return error when removing . and ..
 449  449           */
 450  450          if (nm[0] == '.') {
 451  451                  if (nm[1] == '\0')
 452  452                          return (EINVAL);
 453  453                  if (nm[1] == '.' && nm[2] == '\0')
 454  454                          return (EEXIST); /* thus in ufs */
 455  455          }
 456  456  
 457  457          if (error = tmp_taccess(dir, VEXEC|VWRITE, cred))
 458  458                  return (error);
 459  459  
 460  460          /*
 461  461           * If the parent directory is "sticky", then the user must
 462  462           * own the parent directory or the file in it, or else must
 463  463           * have permission to write the file.  Otherwise it may not
 464  464           * be deleted (except by privileged users).
 465  465           * Same as ufs_dirremove.
 466  466           */
 467  467          if ((error = tmp_sticky_remove_access(dir, tp, cred)) != 0)
 468  468                  return (error);
 469  469  
 470  470          if (dir->tn_dir == NULL)
 471  471                  return (ENOENT);
 472  472  
 473  473          tpdp = tmpfs_hash_lookup(nm, dir, 0, &tnp);
 474  474          if (tpdp == NULL) {
 475  475                  /*
 476  476                   * If it is gone, some other thread got here first!
 477  477                   * Return error ENOENT.
 478  478                   */
 479  479                  return (ENOENT);
 480  480          }
 481  481  
 482  482          /*
 483  483           * If the tmpnode in the tdirent changed, we were probably
 484  484           * the victim of a concurrent rename operation.  The original
 485  485           * is gone, so return that status (same as UFS).
 486  486           */
 487  487          if (tp != tnp)
 488  488                  return (ENOENT);
 489  489  
 490  490          tmpfs_hash_out(tpdp);
 491  491  
 492  492          /*
 493  493           * Take tpdp out of the directory list.
 494  494           */
 495  495          ASSERT(tpdp->td_next != tpdp);
 496  496          ASSERT(tpdp->td_prev != tpdp);
 497  497          if (tpdp->td_prev) {
 498  498                  tpdp->td_prev->td_next = tpdp->td_next;
 499  499          }
 500  500          if (tpdp->td_next) {
 501  501                  tpdp->td_next->td_prev = tpdp->td_prev;
 502  502          }
 503  503  
 504  504          /*
 505  505           * If the roving slot pointer happens to match tpdp,
 506  506           * point it at the previous dirent.
 507  507           */
 508  508          if (dir->tn_dir->td_prev == tpdp) {
 509  509                  dir->tn_dir->td_prev = tpdp->td_prev;
 510  510          }
 511  511          ASSERT(tpdp->td_next != tpdp);
 512  512          ASSERT(tpdp->td_prev != tpdp);
 513  513  
 514  514          /*
 515  515           * tpdp points to the correct directory entry
 516  516           */
 517  517          namelen = strlen(tpdp->td_name) + 1;
  
    | ↓ open down ↓ | 480 lines elided | ↑ open up ↑ | 
 518  518  
 519  519          tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
 520  520          dir->tn_size -= (sizeof (struct tdirent) + namelen);
 521  521          dir->tn_dirents--;
 522  522  
 523  523          gethrestime(&now);
 524  524          dir->tn_mtime = now;
 525  525          dir->tn_ctime = now;
 526  526          tp->tn_ctime = now;
 527  527  
      528 +        /*
     529 +         * If this is a DR_REMOVE (unlink) operation there may
      530 +         * be other links to the directory entry.
      531 +         */
 528  532          ASSERT(tp->tn_nlink > 0);
 529  533          DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
 530      -        if (op == DR_RMDIR && tp->tn_type == VDIR) {
 531      -                tdirtrunc(tp);
 532      -                ASSERT(tp->tn_nlink == 0);
      534 +        if (op == DR_RMDIR || (op == DR_REMOVE && tp->tn_type == VDIR)) {
      535 +                if (tp->tn_nlink > 1) {
      536 +                        ASSERT(op == DR_REMOVE);
      537 +                } else {
      538 +                        tdirtrunc(tp);
      539 +                        ASSERT(tp->tn_nlink == 0);
      540 +                }
 533  541          }
 534  542          return (0);
 535  543  }
 536  544  
 537  545  /*
 538  546   * tdirinit is used internally to initialize a directory (dir)
 539  547   * with '.' and '..' entries without checking permissions and locking
 540  548   */
 541  549  void
 542  550  tdirinit(
 543  551          struct tmpnode *parent,         /* parent of directory to initialize */
 544  552          struct tmpnode *dir)            /* the new directory */
 545  553  {
 546  554          struct tdirent *dot, *dotdot;
 547  555          timestruc_t now;
 548  556  
 549  557          ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
 550  558          ASSERT(dir->tn_type == VDIR);
 551  559  
 552  560          dot = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
 553  561          dotdot = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);
 554  562  
 555  563          /*
 556  564           * Initialize the entries
 557  565           */
 558  566          dot->td_tmpnode = dir;
 559  567          dot->td_offset = 0;
 560  568          dot->td_name = (char *)dot + sizeof (struct tdirent);
 561  569          dot->td_name[0] = '.';
 562  570          dot->td_parent = dir;
 563  571          tmpfs_hash_in(dot);
 564  572  
 565  573          dotdot->td_tmpnode = parent;
 566  574          dotdot->td_offset = 1;
 567  575          dotdot->td_name = (char *)dotdot + sizeof (struct tdirent);
 568  576          dotdot->td_name[0] = '.';
 569  577          dotdot->td_name[1] = '.';
 570  578          dotdot->td_parent = dir;
 571  579          tmpfs_hash_in(dotdot);
 572  580  
 573  581          /*
 574  582           * Initialize directory entry list.
 575  583           */
 576  584          dot->td_next = dotdot;
 577  585          dot->td_prev = dotdot;  /* dot's td_prev holds roving slot pointer */
 578  586          dotdot->td_next = NULL;
 579  587          dotdot->td_prev = dot;
 580  588  
 581  589          gethrestime(&now);
 582  590          dir->tn_mtime = now;
 583  591          dir->tn_ctime = now;
 584  592  
 585  593          /*
 586  594           * Link counts are special for the hidden attribute directory.
 587  595           * The only explicit reference in the name space is "." and
 588  596           * the reference through ".." is not counted on the parent
 589  597           * file. The attrdir is created as a side effect to lookup,
 590  598           * so don't change the ctime of the parent.
 591  599           * Since tdirinit is called with both dir and parent being the
 592  600           * same for the root vnode, we need to increment this before we set
 593  601           * tn_nlink = 2 below.
 594  602           */
 595  603          if (!(dir->tn_vnode->v_flag & V_XATTRDIR)) {
 596  604                  INCR_COUNT(&parent->tn_nlink, &parent->tn_tlock);
 597  605                  parent->tn_ctime = now;
 598  606          }
 599  607  
 600  608          dir->tn_dir = dot;
 601  609          dir->tn_size = 2 * sizeof (struct tdirent) + 5; /* dot and dotdot */
 602  610          dir->tn_dirents = 2;
 603  611          dir->tn_nlink = 2;
 604  612  }
 605  613  
 606  614  
 607  615  /*
  
    | ↓ open down ↓ | 65 lines elided | ↑ open up ↑ | 
 608  616   * tdirtrunc is called to remove all directory entries under this directory.
 609  617   */
 610  618  void
 611  619  tdirtrunc(struct tmpnode *dir)
 612  620  {
 613  621          struct tdirent *tdp;
 614  622          struct tmpnode *tp;
 615  623          size_t namelen;
 616  624          timestruc_t now;
 617  625          int isvattrdir, isdotdot, skip_decr;
      626 +        int lock_held;
 618  627  
 619  628          ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
 620  629          ASSERT(dir->tn_type == VDIR);
 621  630  
 622  631          isvattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 1 : 0;
 623  632          for (tdp = dir->tn_dir; tdp; tdp = dir->tn_dir) {
 624  633                  ASSERT(tdp->td_next != tdp);
 625  634                  ASSERT(tdp->td_prev != tdp);
 626  635                  ASSERT(tdp->td_tmpnode);
 627  636  
 628  637                  dir->tn_dir = tdp->td_next;
 629  638                  namelen = strlen(tdp->td_name) + 1;
 630  639  
 631  640                  /*
 632  641                   * Adjust the link counts to account for this directory
 633  642                   * entry removal. Hidden attribute directories may
 634  643                   * not be empty as they may be truncated as a side-
 635  644                   * effect of removing the parent. We do hold/rele
 636  645                   * operations to free up these tmpnodes.
 637  646                   *
 638  647                   * Skip the link count adjustment for parents of
 639  648                   * attribute directories as those link counts
 640  649                   * do not include the ".." reference in the hidden
 641  650                   * directories.
 642  651                   */
 643  652                  tp = tdp->td_tmpnode;
 644  653                  isdotdot = (strcmp("..", tdp->td_name) == 0);
 645  654                  skip_decr = (isvattrdir && isdotdot);
  
    | ↓ open down ↓ | 18 lines elided | ↑ open up ↑ | 
 646  655                  if (!skip_decr) {
 647  656                          ASSERT(tp->tn_nlink > 0);
 648  657                          DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
 649  658                  }
 650  659  
 651  660                  tmpfs_hash_out(tdp);
 652  661  
 653  662                  tmp_memfree(tdp, sizeof (struct tdirent) + namelen);
 654  663                  dir->tn_size -= (sizeof (struct tdirent) + namelen);
 655  664                  dir->tn_dirents--;
      665 +
      666 +                /*
      667 +                 * This directory entry may itself be a directory with
      668 +                 * entries and removing it may have created orphans.
      669 +                 * On a normal filesystem like UFS this wouldn't be
     670 +                 * a huge problem because fsck can reclaim them.  For
      671 +                 * TMPFS which resides in RAM however, it means we
      672 +                 * end up leaking memory.
      673 +                 *
      674 +                 * To avoid this we also truncate child directories,
      675 +                 * but only if they have no other links to them.
      676 +                 */
      677 +                if (!isdotdot && tp->tn_type == VDIR && tp != dir) {
      678 +                        if (tp->tn_nlink > 1)
      679 +                                continue;
      680 +                        lock_held = RW_WRITE_HELD(&tp->tn_rwlock);
      681 +                        if (!lock_held)
      682 +                                rw_enter(&tp->tn_rwlock, RW_WRITER);
      683 +                        tdirtrunc(tp);
      684 +                        if (!lock_held)
      685 +                                rw_exit(&tp->tn_rwlock);
      686 +                        ASSERT(tp->tn_nlink == 0);
      687 +                }
 656  688          }
 657  689  
 658  690          gethrestime(&now);
 659  691          dir->tn_mtime = now;
 660  692          dir->tn_ctime = now;
 661  693  
 662  694          ASSERT(dir->tn_dir == NULL);
 663  695          ASSERT(dir->tn_size == 0);
 664  696          ASSERT(dir->tn_dirents == 0);
 665  697  }
 666  698  
 667  699  /*
 668  700   * Check if the source directory is in the path of the target directory.
 669  701   * The target directory is locked by the caller.
 670  702   *
 671  703   * XXX - The source and target should be different upon entry.
 672  704   */
 673  705  static int
 674  706  tdircheckpath(
 675  707          struct tmpnode *fromtp,
 676  708          struct tmpnode  *toparent,
 677  709          struct cred     *cred)
 678  710  {
 679  711          int     error = 0;
 680  712          struct tmpnode *dir, *dotdot;
 681  713          struct tdirent *tdp;
 682  714  
 683  715          ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));
 684  716  
 685  717          tdp = tmpfs_hash_lookup("..", toparent, 1, &dotdot);
 686  718          if (tdp == NULL)
 687  719                  return (ENOENT);
 688  720  
 689  721          ASSERT(dotdot);
 690  722  
 691  723          if (dotdot == toparent) {
 692  724                  /* root of fs.  search trivially satisfied. */
 693  725                  tmpnode_rele(dotdot);
 694  726                  return (0);
 695  727          }
 696  728          for (;;) {
 697  729                  /*
 698  730                   * Return error for cases like "mv c c/d",
 699  731                   * "mv c c/d/e" and so on.
 700  732                   */
 701  733                  if (dotdot == fromtp) {
 702  734                          tmpnode_rele(dotdot);
 703  735                          error = EINVAL;
 704  736                          break;
 705  737                  }
 706  738                  dir = dotdot;
 707  739                  error = tdirlookup(dir, "..", &dotdot, cred);
 708  740                  if (error) {
 709  741                          tmpnode_rele(dir);
 710  742                          break;
 711  743                  }
 712  744                  /*
 713  745                   * We're okay if we traverse the directory tree up to
 714  746                   * the root directory and don't run into the
 715  747                   * parent directory.
 716  748                   */
 717  749                  if (dir == dotdot) {
 718  750                          tmpnode_rele(dir);
 719  751                          tmpnode_rele(dotdot);
 720  752                          break;
 721  753                  }
 722  754                  tmpnode_rele(dir);
 723  755          }
 724  756          return (error);
 725  757  }
 726  758  
 727  759  static int
 728  760  tdirrename(
 729  761          struct tmpnode *fromparent,     /* parent directory of source */
 730  762          struct tmpnode *fromtp,         /* source tmpnode */
 731  763          struct tmpnode *toparent,       /* parent directory of target */
 732  764          char *nm,                       /* entry we are trying to change */
 733  765          struct tmpnode *to,             /* target tmpnode */
 734  766          struct tdirent *where,          /* target tmpnode directory entry */
 735  767          struct cred *cred)              /* credentials */
 736  768  {
 737  769          int error = 0;
 738  770          int doingdirectory;
 739  771          timestruc_t now;
 740  772  
 741  773  #if defined(lint)
 742  774          nm = nm;
 743  775  #endif
 744  776          ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));
 745  777  
 746  778          /*
 747  779           * Short circuit rename of something to itself.
 748  780           */
 749  781          if (fromtp == to)
 750  782                  return (ESAME);         /* special KLUDGE error code */
 751  783  
 752  784          rw_enter(&fromtp->tn_rwlock, RW_READER);
 753  785          rw_enter(&to->tn_rwlock, RW_READER);
 754  786  
 755  787          /*
 756  788           * Check that everything is on the same filesystem.
 757  789           */
 758  790          if (to->tn_vnode->v_vfsp != toparent->tn_vnode->v_vfsp ||
 759  791              to->tn_vnode->v_vfsp != fromtp->tn_vnode->v_vfsp) {
 760  792                  error = EXDEV;
 761  793                  goto out;
 762  794          }
 763  795  
 764  796          /*
 765  797           * Must have write permission to rewrite target entry.
 766  798           * Check for stickyness.
 767  799           */
 768  800          if ((error = tmp_taccess(toparent, VWRITE, cred)) != 0 ||
 769  801              (error = tmp_sticky_remove_access(toparent, to, cred)) != 0)
 770  802                  goto out;
 771  803  
 772  804          /*
 773  805           * Ensure source and target are compatible (both directories
 774  806           * or both not directories).  If target is a directory it must
 775  807           * be empty and have no links to it; in addition it must not
 776  808           * be a mount point, and both the source and target must be
 777  809           * writable.
 778  810           */
 779  811          doingdirectory = (fromtp->tn_type == VDIR);
 780  812          if (to->tn_type == VDIR) {
 781  813                  if (!doingdirectory) {
 782  814                          error = EISDIR;
 783  815                          goto out;
 784  816                  }
 785  817                  /*
 786  818                   * vn_vfswlock will prevent mounts from using the directory
 787  819                   * until we are done.
 788  820                   */
 789  821                  if (vn_vfswlock(TNTOV(to))) {
 790  822                          error = EBUSY;
 791  823                          goto out;
 792  824                  }
 793  825                  if (vn_mountedvfs(TNTOV(to)) != NULL) {
 794  826                          vn_vfsunlock(TNTOV(to));
 795  827                          error = EBUSY;
 796  828                          goto out;
 797  829                  }
 798  830  
 799  831                  mutex_enter(&to->tn_tlock);
 800  832                  if (to->tn_dirents > 2 || to->tn_nlink > 2) {
 801  833                          mutex_exit(&to->tn_tlock);
 802  834                          vn_vfsunlock(TNTOV(to));
 803  835                          error = EEXIST; /* SIGH should be ENOTEMPTY */
 804  836                          /*
 805  837                           * Update atime because checking tn_dirents is
 806  838                           * logically equivalent to reading the directory
 807  839                           */
 808  840                          gethrestime(&to->tn_atime);
 809  841                          goto out;
 810  842                  }
 811  843                  mutex_exit(&to->tn_tlock);
 812  844          } else if (doingdirectory) {
 813  845                  error = ENOTDIR;
 814  846                  goto out;
 815  847          }
 816  848  
 817  849          tmpfs_hash_change(where, fromtp);
 818  850          gethrestime(&now);
 819  851          toparent->tn_mtime = now;
 820  852          toparent->tn_ctime = now;
 821  853  
 822  854          /*
 823  855           * Upgrade to write lock on "to" (i.e., the target tmpnode).
 824  856           */
 825  857          rw_exit(&to->tn_rwlock);
 826  858          rw_enter(&to->tn_rwlock, RW_WRITER);
 827  859  
 828  860          /*
 829  861           * Decrement the link count of the target tmpnode.
 830  862           */
 831  863          DECR_COUNT(&to->tn_nlink, &to->tn_tlock);
 832  864          to->tn_ctime = now;
 833  865  
 834  866          if (doingdirectory) {
 835  867                  /*
 836  868                   * The entry for "to" no longer exists so release the vfslock.
 837  869                   */
 838  870                  vn_vfsunlock(TNTOV(to));
 839  871  
 840  872                  /*
 841  873                   * Decrement the target link count and delete all entires.
 842  874                   */
 843  875                  tdirtrunc(to);
 844  876                  ASSERT(to->tn_nlink == 0);
 845  877  
 846  878                  /*
 847  879                   * Renaming a directory with the parent different
 848  880                   * requires that ".." be rewritten.  The window is
 849  881                   * still there for ".." to be inconsistent, but this
 850  882                   * is unavoidable, and a lot shorter than when it was
 851  883                   * done in a user process.
 852  884                   */
 853  885                  if (fromparent != toparent)
 854  886                          tdirfixdotdot(fromtp, fromparent, toparent);
 855  887          }
 856  888  out:
 857  889          rw_exit(&to->tn_rwlock);
 858  890          rw_exit(&fromtp->tn_rwlock);
 859  891          return (error);
 860  892  }
 861  893  
 862  894  static void
 863  895  tdirfixdotdot(
 864  896          struct tmpnode  *fromtp,        /* child directory */
 865  897          struct tmpnode  *fromparent,    /* old parent directory */
 866  898          struct tmpnode  *toparent)      /* new parent directory */
 867  899  {
 868  900          struct tdirent  *dotdot;
 869  901  
 870  902          ASSERT(RW_LOCK_HELD(&toparent->tn_rwlock));
 871  903  
 872  904          /*
 873  905           * Increment the link count in the new parent tmpnode
 874  906           */
 875  907          INCR_COUNT(&toparent->tn_nlink, &toparent->tn_tlock);
 876  908          gethrestime(&toparent->tn_ctime);
 877  909  
 878  910          dotdot = tmpfs_hash_lookup("..", fromtp, 0, NULL);
 879  911  
 880  912          ASSERT(dotdot->td_tmpnode == fromparent);
 881  913          dotdot->td_tmpnode = toparent;
 882  914  
 883  915          /*
 884  916           * Decrement the link count of the old parent tmpnode.
 885  917           * If fromparent is NULL, then this is a new directory link;
 886  918           * it has no parent, so we need not do anything.
 887  919           */
 888  920          if (fromparent != NULL) {
 889  921                  mutex_enter(&fromparent->tn_tlock);
 890  922                  if (fromparent->tn_nlink != 0) {
 891  923                          fromparent->tn_nlink--;
 892  924                          gethrestime(&fromparent->tn_ctime);
 893  925                  }
 894  926                  mutex_exit(&fromparent->tn_tlock);
 895  927          }
 896  928  }
 897  929  
 898  930  static int
 899  931  tdiraddentry(
 900  932          struct tmpnode  *dir,   /* target directory to make entry in */
 901  933          struct tmpnode  *tp,    /* new tmpnode */
 902  934          char            *name,
 903  935          enum de_op      op,
 904  936          struct tmpnode  *fromtp)
 905  937  {
 906  938          struct tdirent *tdp, *tpdp;
 907  939          size_t          namelen, alloc_size;
 908  940          timestruc_t     now;
 909  941  
 910  942          /*
 911  943           * Make sure the parent directory wasn't removed from
 912  944           * underneath the caller.
 913  945           */
 914  946          if (dir->tn_dir == NULL)
 915  947                  return (ENOENT);
 916  948  
 917  949          /*
 918  950           * Check that everything is on the same filesystem.
 919  951           */
 920  952          if (tp->tn_vnode->v_vfsp != dir->tn_vnode->v_vfsp)
 921  953                  return (EXDEV);
 922  954  
 923  955          /*
 924  956           * Allocate and initialize directory entry
 925  957           */
 926  958          namelen = strlen(name) + 1;
 927  959          alloc_size = namelen + sizeof (struct tdirent);
 928  960          tdp = tmp_memalloc(alloc_size, 0);
 929  961          if (tdp == NULL)
 930  962                  return (ENOSPC);
 931  963  
 932  964          if ((op == DE_RENAME) && (tp->tn_type == VDIR))
 933  965                  tdirfixdotdot(tp, fromtp, dir);
 934  966  
 935  967          dir->tn_size += alloc_size;
 936  968          dir->tn_dirents++;
 937  969          tdp->td_tmpnode = tp;
 938  970          tdp->td_parent = dir;
 939  971  
 940  972          /*
 941  973           * The directory entry and its name were allocated sequentially.
 942  974           */
 943  975          tdp->td_name = (char *)tdp + sizeof (struct tdirent);
 944  976          (void) strcpy(tdp->td_name, name);
 945  977  
 946  978          tmpfs_hash_in(tdp);
 947  979  
 948  980          /*
 949  981           * Some utilities expect the size of a directory to remain
 950  982           * somewhat static.  For example, a routine which unlinks
 951  983           * files between calls to readdir(); the size of the
 952  984           * directory changes from underneath it and so the real
 953  985           * directory offset in bytes is invalid.  To circumvent
 954  986           * this problem, we initialize a directory entry with an
 955  987           * phony offset, and use this offset to determine end of
 956  988           * file in tmp_readdir.
 957  989           */
 958  990          tpdp = dir->tn_dir->td_prev;
 959  991          /*
 960  992           * Install at first empty "slot" in directory list.
 961  993           */
 962  994          while (tpdp->td_next != NULL && (tpdp->td_next->td_offset -
 963  995              tpdp->td_offset) <= 1) {
 964  996                  ASSERT(tpdp->td_next != tpdp);
 965  997                  ASSERT(tpdp->td_prev != tpdp);
 966  998                  ASSERT(tpdp->td_next->td_offset > tpdp->td_offset);
 967  999                  tpdp = tpdp->td_next;
 968 1000          }
 969 1001          tdp->td_offset = tpdp->td_offset + 1;
 970 1002  
 971 1003          /*
 972 1004           * If we're at the end of the dirent list and the offset (which
 973 1005           * is necessarily the largest offset in this directory) is more
 974 1006           * than twice the number of dirents, that means the directory is
 975 1007           * 50% holes.  At this point we reset the slot pointer back to
 976 1008           * the beginning of the directory so we start using the holes.
 977 1009           * The idea is that if there are N dirents, there must also be
 978 1010           * N holes, so we can satisfy the next N creates by walking at
 979 1011           * most 2N entries; thus the average cost of a create is constant.
 980 1012           * Note that we use the first dirent's td_prev as the roving
 981 1013           * slot pointer; it's ugly, but it saves a word in every dirent.
 982 1014           */
 983 1015          if (tpdp->td_next == NULL && tpdp->td_offset > 2 * dir->tn_dirents)
 984 1016                  dir->tn_dir->td_prev = dir->tn_dir->td_next;
 985 1017          else
 986 1018                  dir->tn_dir->td_prev = tdp;
 987 1019  
 988 1020          ASSERT(tpdp->td_next != tpdp);
 989 1021          ASSERT(tpdp->td_prev != tpdp);
 990 1022  
 991 1023          tdp->td_next = tpdp->td_next;
 992 1024          if (tdp->td_next) {
 993 1025                  tdp->td_next->td_prev = tdp;
 994 1026          }
 995 1027          tdp->td_prev = tpdp;
 996 1028          tpdp->td_next = tdp;
 997 1029  
 998 1030          ASSERT(tdp->td_next != tdp);
 999 1031          ASSERT(tdp->td_prev != tdp);
1000 1032          ASSERT(tpdp->td_next != tpdp);
1001 1033          ASSERT(tpdp->td_prev != tpdp);
1002 1034  
1003 1035          gethrestime(&now);
1004 1036          dir->tn_mtime = now;
1005 1037          dir->tn_ctime = now;
1006 1038  
1007 1039          return (0);
1008 1040  }
1009 1041  
1010 1042  static int
1011 1043  tdirmaketnode(
1012 1044          struct tmpnode *dir,
1013 1045          struct tmount   *tm,
1014 1046          struct vattr    *va,
1015 1047          enum    de_op   op,
1016 1048          struct tmpnode **newnode,
1017 1049          struct cred     *cred)
1018 1050  {
1019 1051          struct tmpnode *tp;
1020 1052          enum vtype      type;
1021 1053  
1022 1054          ASSERT(va != NULL);
1023 1055          ASSERT(op == DE_CREATE || op == DE_MKDIR);
1024 1056          if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
1025 1057              ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
1026 1058                  return (EOVERFLOW);
1027 1059          type = va->va_type;
1028 1060          tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
1029 1061          tmpnode_init(tm, tp, va, cred);
1030 1062  
1031 1063          /* setup normal file/dir's extended attribute directory */
1032 1064          if (dir->tn_flags & ISXATTR) {
1033 1065                  /* parent dir is , mark file as xattr */
1034 1066                  tp->tn_flags |= ISXATTR;
1035 1067          }
1036 1068  
1037 1069  
1038 1070          if (type == VBLK || type == VCHR) {
1039 1071                  tp->tn_vnode->v_rdev = tp->tn_rdev = va->va_rdev;
1040 1072          } else {
1041 1073                  tp->tn_vnode->v_rdev = tp->tn_rdev = NODEV;
1042 1074          }
1043 1075          tp->tn_vnode->v_type = type;
1044 1076          tp->tn_uid = crgetuid(cred);
1045 1077  
1046 1078          /*
1047 1079           * To determine the group-id of the created file:
1048 1080           *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
1049 1081           *      clients are not likely to set the gid), then use it if
1050 1082           *      the process is privileged, belongs to the target group,
1051 1083           *      or the group is the same as the parent directory.
1052 1084           *   2) If the filesystem was not mounted with the Old-BSD-compatible
1053 1085           *      GRPID option, and the directory's set-gid bit is clear,
1054 1086           *      then use the process's gid.
1055 1087           *   3) Otherwise, set the group-id to the gid of the parent directory.
1056 1088           */
1057 1089          if ((va->va_mask & AT_GID) &&
1058 1090              ((va->va_gid == dir->tn_gid) || groupmember(va->va_gid, cred) ||
1059 1091              secpolicy_vnode_create_gid(cred) == 0)) {
1060 1092                  /*
1061 1093                   * XXX - is this only the case when a 4.0 NFS client, or a
1062 1094                   * client derived from that code, makes a call over the wire?
1063 1095                   */
1064 1096                  tp->tn_gid = va->va_gid;
1065 1097          } else {
1066 1098                  if (dir->tn_mode & VSGID)
1067 1099                          tp->tn_gid = dir->tn_gid;
1068 1100                  else
1069 1101                          tp->tn_gid = crgetgid(cred);
1070 1102          }
1071 1103          /*
1072 1104           * If we're creating a directory, and the parent directory has the
1073 1105           * set-GID bit set, set it on the new directory.
1074 1106           * Otherwise, if the user is neither privileged nor a member of the
1075 1107           * file's new group, clear the file's set-GID bit.
1076 1108           */
1077 1109          if (dir->tn_mode & VSGID && type == VDIR)
1078 1110                  tp->tn_mode |= VSGID;
1079 1111          else {
1080 1112                  if ((tp->tn_mode & VSGID) &&
1081 1113                      secpolicy_vnode_setids_setgids(cred, tp->tn_gid) != 0)
1082 1114                          tp->tn_mode &= ~VSGID;
1083 1115          }
1084 1116  
1085 1117          if (va->va_mask & AT_ATIME)
1086 1118                  tp->tn_atime = va->va_atime;
1087 1119          if (va->va_mask & AT_MTIME)
1088 1120                  tp->tn_mtime = va->va_mtime;
1089 1121  
1090 1122          if (op == DE_MKDIR)
1091 1123                  tdirinit(dir, tp);
1092 1124  
1093 1125          *newnode = tp;
1094 1126          return (0);
1095 1127  }
  
    | ↓ open down ↓ | 430 lines elided | ↑ open up ↑ | 
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX