4045 zfs write throttle & i/o scheduler performance work
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
    
      
    
          --- old/usr/src/uts/common/fs/zfs/dsl_dir.c
          +++ new/usr/src/uts/common/fs/zfs/dsl_dir.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013 Martin Matuska. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/dmu.h>
  28   28  #include <sys/dmu_objset.h>
  29   29  #include <sys/dmu_tx.h>
  30   30  #include <sys/dsl_dataset.h>
  31   31  #include <sys/dsl_dir.h>
  32   32  #include <sys/dsl_prop.h>
  33   33  #include <sys/dsl_synctask.h>
  34   34  #include <sys/dsl_deleg.h>
  35   35  #include <sys/spa.h>
  36   36  #include <sys/metaslab.h>
  37   37  #include <sys/zap.h>
  38   38  #include <sys/zio.h>
  39   39  #include <sys/arc.h>
  40   40  #include <sys/sunddi.h>
  41   41  #include "zfs_namecheck.h"
  42   42  
  43   43  static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
  44   44  
  45   45  /* ARGSUSED */
  46   46  static void
  47   47  dsl_dir_evict(dmu_buf_t *db, void *arg)
  48   48  {
  49   49          dsl_dir_t *dd = arg;
  50   50          dsl_pool_t *dp = dd->dd_pool;
  51   51          int t;
  52   52  
  53   53          for (t = 0; t < TXG_SIZE; t++) {
  54   54                  ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
  55   55                  ASSERT(dd->dd_tempreserved[t] == 0);
  56   56                  ASSERT(dd->dd_space_towrite[t] == 0);
  57   57          }
  58   58  
  59   59          if (dd->dd_parent)
  60   60                  dsl_dir_rele(dd->dd_parent, dd);
  61   61  
  62   62          spa_close(dd->dd_pool->dp_spa, dd);
  63   63  
  64   64          /*
  65   65           * The props callback list should have been cleaned up by
  66   66           * objset_evict().
  67   67           */
  68   68          list_destroy(&dd->dd_prop_cbs);
  69   69          mutex_destroy(&dd->dd_lock);
  70   70          kmem_free(dd, sizeof (dsl_dir_t));
  71   71  }
  72   72  
  73   73  int
  74   74  dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
  75   75      const char *tail, void *tag, dsl_dir_t **ddp)
  76   76  {
  77   77          dmu_buf_t *dbuf;
  78   78          dsl_dir_t *dd;
  79   79          int err;
  80   80  
  81   81          ASSERT(dsl_pool_config_held(dp));
  82   82  
  83   83          err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
  84   84          if (err != 0)
  85   85                  return (err);
  86   86          dd = dmu_buf_get_user(dbuf);
  87   87  #ifdef ZFS_DEBUG
  88   88          {
  89   89                  dmu_object_info_t doi;
  90   90                  dmu_object_info_from_db(dbuf, &doi);
  91   91                  ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
  92   92                  ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
  93   93          }
  94   94  #endif
  95   95          if (dd == NULL) {
  96   96                  dsl_dir_t *winner;
  97   97  
  98   98                  dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP);
  99   99                  dd->dd_object = ddobj;
 100  100                  dd->dd_dbuf = dbuf;
 101  101                  dd->dd_pool = dp;
 102  102                  dd->dd_phys = dbuf->db_data;
 103  103                  mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
 104  104  
 105  105                  list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
 106  106                      offsetof(dsl_prop_cb_record_t, cbr_node));
 107  107  
 108  108                  dsl_dir_snap_cmtime_update(dd);
 109  109  
 110  110                  if (dd->dd_phys->dd_parent_obj) {
 111  111                          err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj,
 112  112                              NULL, dd, &dd->dd_parent);
 113  113                          if (err != 0)
 114  114                                  goto errout;
 115  115                          if (tail) {
 116  116  #ifdef ZFS_DEBUG
 117  117                                  uint64_t foundobj;
 118  118  
 119  119                                  err = zap_lookup(dp->dp_meta_objset,
 120  120                                      dd->dd_parent->dd_phys->dd_child_dir_zapobj,
 121  121                                      tail, sizeof (foundobj), 1, &foundobj);
 122  122                                  ASSERT(err || foundobj == ddobj);
 123  123  #endif
 124  124                                  (void) strcpy(dd->dd_myname, tail);
 125  125                          } else {
 126  126                                  err = zap_value_search(dp->dp_meta_objset,
 127  127                                      dd->dd_parent->dd_phys->dd_child_dir_zapobj,
 128  128                                      ddobj, 0, dd->dd_myname);
 129  129                          }
 130  130                          if (err != 0)
 131  131                                  goto errout;
 132  132                  } else {
 133  133                          (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
 134  134                  }
 135  135  
 136  136                  if (dsl_dir_is_clone(dd)) {
 137  137                          dmu_buf_t *origin_bonus;
 138  138                          dsl_dataset_phys_t *origin_phys;
 139  139  
 140  140                          /*
 141  141                           * We can't open the origin dataset, because
 142  142                           * that would require opening this dsl_dir.
 143  143                           * Just look at its phys directly instead.
 144  144                           */
 145  145                          err = dmu_bonus_hold(dp->dp_meta_objset,
 146  146                              dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
 147  147                          if (err != 0)
 148  148                                  goto errout;
 149  149                          origin_phys = origin_bonus->db_data;
 150  150                          dd->dd_origin_txg =
 151  151                              origin_phys->ds_creation_txg;
 152  152                          dmu_buf_rele(origin_bonus, FTAG);
 153  153                  }
 154  154  
 155  155                  winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
 156  156                      dsl_dir_evict);
 157  157                  if (winner) {
 158  158                          if (dd->dd_parent)
 159  159                                  dsl_dir_rele(dd->dd_parent, dd);
 160  160                          mutex_destroy(&dd->dd_lock);
 161  161                          kmem_free(dd, sizeof (dsl_dir_t));
 162  162                          dd = winner;
 163  163                  } else {
 164  164                          spa_open_ref(dp->dp_spa, dd);
 165  165                  }
 166  166          }
 167  167  
 168  168          /*
 169  169           * The dsl_dir_t has both open-to-close and instantiate-to-evict
 170  170           * holds on the spa.  We need the open-to-close holds because
 171  171           * otherwise the spa_refcnt wouldn't change when we open a
 172  172           * dir which the spa also has open, so we could incorrectly
 173  173           * think it was OK to unload/export/destroy the pool.  We need
 174  174           * the instantiate-to-evict hold because the dsl_dir_t has a
 175  175           * pointer to the dd_pool, which has a pointer to the spa_t.
 176  176           */
 177  177          spa_open_ref(dp->dp_spa, tag);
 178  178          ASSERT3P(dd->dd_pool, ==, dp);
 179  179          ASSERT3U(dd->dd_object, ==, ddobj);
 180  180          ASSERT3P(dd->dd_dbuf, ==, dbuf);
 181  181          *ddp = dd;
 182  182          return (0);
 183  183  
 184  184  errout:
 185  185          if (dd->dd_parent)
 186  186                  dsl_dir_rele(dd->dd_parent, dd);
 187  187          mutex_destroy(&dd->dd_lock);
 188  188          kmem_free(dd, sizeof (dsl_dir_t));
 189  189          dmu_buf_rele(dbuf, tag);
 190  190          return (err);
 191  191  }
 192  192  
 193  193  void
 194  194  dsl_dir_rele(dsl_dir_t *dd, void *tag)
 195  195  {
 196  196          dprintf_dd(dd, "%s\n", "");
 197  197          spa_close(dd->dd_pool->dp_spa, tag);
 198  198          dmu_buf_rele(dd->dd_dbuf, tag);
 199  199  }
 200  200  
 201  201  /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
 202  202  void
 203  203  dsl_dir_name(dsl_dir_t *dd, char *buf)
 204  204  {
 205  205          if (dd->dd_parent) {
 206  206                  dsl_dir_name(dd->dd_parent, buf);
 207  207                  (void) strcat(buf, "/");
 208  208          } else {
 209  209                  buf[0] = '\0';
 210  210          }
 211  211          if (!MUTEX_HELD(&dd->dd_lock)) {
 212  212                  /*
 213  213                   * recursive mutex so that we can use
 214  214                   * dprintf_dd() with dd_lock held
 215  215                   */
 216  216                  mutex_enter(&dd->dd_lock);
 217  217                  (void) strcat(buf, dd->dd_myname);
 218  218                  mutex_exit(&dd->dd_lock);
 219  219          } else {
 220  220                  (void) strcat(buf, dd->dd_myname);
 221  221          }
 222  222  }
 223  223  
 224  224  /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */
 225  225  int
 226  226  dsl_dir_namelen(dsl_dir_t *dd)
 227  227  {
 228  228          int result = 0;
 229  229  
 230  230          if (dd->dd_parent) {
 231  231                  /* parent's name + 1 for the "/" */
 232  232                  result = dsl_dir_namelen(dd->dd_parent) + 1;
 233  233          }
 234  234  
 235  235          if (!MUTEX_HELD(&dd->dd_lock)) {
 236  236                  /* see dsl_dir_name */
 237  237                  mutex_enter(&dd->dd_lock);
 238  238                  result += strlen(dd->dd_myname);
 239  239                  mutex_exit(&dd->dd_lock);
 240  240          } else {
 241  241                  result += strlen(dd->dd_myname);
 242  242          }
 243  243  
 244  244          return (result);
 245  245  }
 246  246  
 247  247  static int
 248  248  getcomponent(const char *path, char *component, const char **nextp)
 249  249  {
 250  250          char *p;
 251  251  
 252  252          if ((path == NULL) || (path[0] == '\0'))
 253  253                  return (SET_ERROR(ENOENT));
 254  254          /* This would be a good place to reserve some namespace... */
 255  255          p = strpbrk(path, "/@");
 256  256          if (p && (p[1] == '/' || p[1] == '@')) {
 257  257                  /* two separators in a row */
 258  258                  return (SET_ERROR(EINVAL));
 259  259          }
 260  260          if (p == NULL || p == path) {
 261  261                  /*
 262  262                   * if the first thing is an @ or /, it had better be an
 263  263                   * @ and it had better not have any more ats or slashes,
 264  264                   * and it had better have something after the @.
 265  265                   */
 266  266                  if (p != NULL &&
 267  267                      (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
 268  268                          return (SET_ERROR(EINVAL));
 269  269                  if (strlen(path) >= MAXNAMELEN)
 270  270                          return (SET_ERROR(ENAMETOOLONG));
 271  271                  (void) strcpy(component, path);
 272  272                  p = NULL;
 273  273          } else if (p[0] == '/') {
 274  274                  if (p - path >= MAXNAMELEN)
 275  275                          return (SET_ERROR(ENAMETOOLONG));
 276  276                  (void) strncpy(component, path, p - path);
 277  277                  component[p - path] = '\0';
 278  278                  p++;
 279  279          } else if (p[0] == '@') {
 280  280                  /*
 281  281                   * if the next separator is an @, there better not be
 282  282                   * any more slashes.
 283  283                   */
 284  284                  if (strchr(path, '/'))
 285  285                          return (SET_ERROR(EINVAL));
 286  286                  if (p - path >= MAXNAMELEN)
 287  287                          return (SET_ERROR(ENAMETOOLONG));
 288  288                  (void) strncpy(component, path, p - path);
 289  289                  component[p - path] = '\0';
 290  290          } else {
 291  291                  panic("invalid p=%p", (void *)p);
 292  292          }
 293  293          *nextp = p;
 294  294          return (0);
 295  295  }
 296  296  
 297  297  /*
 298  298   * Return the dsl_dir_t, and possibly the last component which couldn't
 299  299   * be found in *tail.  The name must be in the specified dsl_pool_t.  This
 300  300   * thread must hold the dp_config_rwlock for the pool.  Returns NULL if the
 301  301   * path is bogus, or if tail==NULL and we couldn't parse the whole name.
 302  302   * (*tail)[0] == '@' means that the last component is a snapshot.
 303  303   */
 304  304  int
 305  305  dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
 306  306      dsl_dir_t **ddp, const char **tailp)
 307  307  {
 308  308          char buf[MAXNAMELEN];
 309  309          const char *spaname, *next, *nextnext = NULL;
 310  310          int err;
 311  311          dsl_dir_t *dd;
 312  312          uint64_t ddobj;
 313  313  
 314  314          err = getcomponent(name, buf, &next);
 315  315          if (err != 0)
 316  316                  return (err);
 317  317  
 318  318          /* Make sure the name is in the specified pool. */
 319  319          spaname = spa_name(dp->dp_spa);
 320  320          if (strcmp(buf, spaname) != 0)
 321  321                  return (SET_ERROR(EINVAL));
 322  322  
 323  323          ASSERT(dsl_pool_config_held(dp));
 324  324  
 325  325          err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
 326  326          if (err != 0) {
 327  327                  return (err);
 328  328          }
 329  329  
 330  330          while (next != NULL) {
 331  331                  dsl_dir_t *child_ds;
 332  332                  err = getcomponent(next, buf, &nextnext);
 333  333                  if (err != 0)
 334  334                          break;
 335  335                  ASSERT(next[0] != '\0');
 336  336                  if (next[0] == '@')
 337  337                          break;
 338  338                  dprintf("looking up %s in obj%lld\n",
 339  339                      buf, dd->dd_phys->dd_child_dir_zapobj);
 340  340  
 341  341                  err = zap_lookup(dp->dp_meta_objset,
 342  342                      dd->dd_phys->dd_child_dir_zapobj,
 343  343                      buf, sizeof (ddobj), 1, &ddobj);
 344  344                  if (err != 0) {
 345  345                          if (err == ENOENT)
 346  346                                  err = 0;
 347  347                          break;
 348  348                  }
 349  349  
 350  350                  err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds);
 351  351                  if (err != 0)
 352  352                          break;
 353  353                  dsl_dir_rele(dd, tag);
 354  354                  dd = child_ds;
 355  355                  next = nextnext;
 356  356          }
 357  357  
 358  358          if (err != 0) {
 359  359                  dsl_dir_rele(dd, tag);
 360  360                  return (err);
 361  361          }
 362  362  
 363  363          /*
 364  364           * It's an error if there's more than one component left, or
 365  365           * tailp==NULL and there's any component left.
 366  366           */
 367  367          if (next != NULL &&
 368  368              (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
 369  369                  /* bad path name */
 370  370                  dsl_dir_rele(dd, tag);
 371  371                  dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
 372  372                  err = SET_ERROR(ENOENT);
 373  373          }
 374  374          if (tailp != NULL)
 375  375                  *tailp = next;
 376  376          *ddp = dd;
 377  377          return (err);
 378  378  }
 379  379  
 380  380  uint64_t
 381  381  dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
 382  382      dmu_tx_t *tx)
 383  383  {
 384  384          objset_t *mos = dp->dp_meta_objset;
 385  385          uint64_t ddobj;
 386  386          dsl_dir_phys_t *ddphys;
 387  387          dmu_buf_t *dbuf;
 388  388  
 389  389          ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
 390  390              DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
 391  391          if (pds) {
 392  392                  VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
 393  393                      name, sizeof (uint64_t), 1, &ddobj, tx));
 394  394          } else {
 395  395                  /* it's the root dir */
 396  396                  VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
 397  397                      DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx));
 398  398          }
 399  399          VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
 400  400          dmu_buf_will_dirty(dbuf, tx);
 401  401          ddphys = dbuf->db_data;
 402  402  
 403  403          ddphys->dd_creation_time = gethrestime_sec();
 404  404          if (pds)
 405  405                  ddphys->dd_parent_obj = pds->dd_object;
 406  406          ddphys->dd_props_zapobj = zap_create(mos,
 407  407              DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
 408  408          ddphys->dd_child_dir_zapobj = zap_create(mos,
 409  409              DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
 410  410          if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN)
 411  411                  ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN;
 412  412          dmu_buf_rele(dbuf, FTAG);
 413  413  
 414  414          return (ddobj);
 415  415  }
 416  416  
 417  417  boolean_t
 418  418  dsl_dir_is_clone(dsl_dir_t *dd)
 419  419  {
 420  420          return (dd->dd_phys->dd_origin_obj &&
 421  421              (dd->dd_pool->dp_origin_snap == NULL ||
 422  422              dd->dd_phys->dd_origin_obj !=
 423  423              dd->dd_pool->dp_origin_snap->ds_object));
 424  424  }
 425  425  
 426  426  void
 427  427  dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv)
 428  428  {
 429  429          mutex_enter(&dd->dd_lock);
 430  430          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
 431  431              dd->dd_phys->dd_used_bytes);
 432  432          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
 433  433          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
 434  434              dd->dd_phys->dd_reserved);
 435  435          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
 436  436              dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
 437  437              (dd->dd_phys->dd_uncompressed_bytes * 100 /
 438  438              dd->dd_phys->dd_compressed_bytes));
 439  439          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
 440  440              dd->dd_phys->dd_uncompressed_bytes);
 441  441          if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 442  442                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
 443  443                      dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
 444  444                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
 445  445                      dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
 446  446                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
 447  447                      dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
 448  448                  dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
 449  449                      dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
 450  450                      dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
 451  451          }
 452  452          mutex_exit(&dd->dd_lock);
 453  453  
 454  454          if (dsl_dir_is_clone(dd)) {
 455  455                  dsl_dataset_t *ds;
 456  456                  char buf[MAXNAMELEN];
 457  457  
 458  458                  VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
 459  459                      dd->dd_phys->dd_origin_obj, FTAG, &ds));
 460  460                  dsl_dataset_name(ds, buf);
 461  461                  dsl_dataset_rele(ds, FTAG);
 462  462                  dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
 463  463          }
 464  464  }
 465  465  
 466  466  void
 467  467  dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx)
 468  468  {
 469  469          dsl_pool_t *dp = dd->dd_pool;
 470  470  
 471  471          ASSERT(dd->dd_phys);
 472  472  
 473  473          if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
 474  474                  /* up the hold count until we can be written out */
 475  475                  dmu_buf_add_ref(dd->dd_dbuf, dd);
 476  476          }
 477  477  }
 478  478  
 479  479  static int64_t
 480  480  parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
 481  481  {
 482  482          uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
 483  483          uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
 484  484          return (new_accounted - old_accounted);
 485  485  }
 486  486  
 487  487  void
 488  488  dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
 489  489  {
 490  490          ASSERT(dmu_tx_is_syncing(tx));
 491  491  
 492  492          mutex_enter(&dd->dd_lock);
 493  493          ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]);
 494  494          dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg,
 495  495              dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024);
 496  496          dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0;
 497  497          mutex_exit(&dd->dd_lock);
 498  498  
 499  499          /* release the hold from dsl_dir_dirty */
 500  500          dmu_buf_rele(dd->dd_dbuf, dd);
 501  501  }
 502  502  
 503  503  static uint64_t
 504  504  dsl_dir_space_towrite(dsl_dir_t *dd)
 505  505  {
 506  506          uint64_t space = 0;
 507  507          int i;
 508  508  
 509  509          ASSERT(MUTEX_HELD(&dd->dd_lock));
 510  510  
 511  511          for (i = 0; i < TXG_SIZE; i++) {
 512  512                  space += dd->dd_space_towrite[i&TXG_MASK];
 513  513                  ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0);
 514  514          }
 515  515          return (space);
 516  516  }
 517  517  
 518  518  /*
 519  519   * How much space would dd have available if ancestor had delta applied
 520  520   * to it?  If ondiskonly is set, we're only interested in what's
 521  521   * on-disk, not estimated pending changes.
 522  522   */
 523  523  uint64_t
 524  524  dsl_dir_space_available(dsl_dir_t *dd,
 525  525      dsl_dir_t *ancestor, int64_t delta, int ondiskonly)
 526  526  {
 527  527          uint64_t parentspace, myspace, quota, used;
 528  528  
 529  529          /*
 530  530           * If there are no restrictions otherwise, assume we have
 531  531           * unlimited space available.
 532  532           */
 533  533          quota = UINT64_MAX;
 534  534          parentspace = UINT64_MAX;
 535  535  
 536  536          if (dd->dd_parent != NULL) {
 537  537                  parentspace = dsl_dir_space_available(dd->dd_parent,
 538  538                      ancestor, delta, ondiskonly);
 539  539          }
 540  540  
 541  541          mutex_enter(&dd->dd_lock);
 542  542          if (dd->dd_phys->dd_quota != 0)
 543  543                  quota = dd->dd_phys->dd_quota;
 544  544          used = dd->dd_phys->dd_used_bytes;
 545  545          if (!ondiskonly)
 546  546                  used += dsl_dir_space_towrite(dd);
 547  547  
 548  548          if (dd->dd_parent == NULL) {
 549  549                  uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE);
 550  550                  quota = MIN(quota, poolsize);
 551  551          }
 552  552  
 553  553          if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
 554  554                  /*
 555  555                   * We have some space reserved, in addition to what our
 556  556                   * parent gave us.
 557  557                   */
 558  558                  parentspace += dd->dd_phys->dd_reserved - used;
 559  559          }
 560  560  
 561  561          if (dd == ancestor) {
 562  562                  ASSERT(delta <= 0);
 563  563                  ASSERT(used >= -delta);
 564  564                  used += delta;
 565  565                  if (parentspace != UINT64_MAX)
 566  566                          parentspace -= delta;
 567  567          }
 568  568  
 569  569          if (used > quota) {
 570  570                  /* over quota */
 571  571                  myspace = 0;
 572  572          } else {
 573  573                  /*
 574  574                   * the lesser of the space provided by our parent and
 575  575                   * the space left in our quota
 576  576                   */
  
 577  577                  myspace = MIN(parentspace, quota - used);
 578  578          }
 579  579  
 580  580          mutex_exit(&dd->dd_lock);
 581  581  
 582  582          return (myspace);
 583  583  }
 584  584  
 585  585  struct tempreserve {
 586  586          list_node_t tr_node;
 587      -        dsl_pool_t *tr_dp;
 588  587          dsl_dir_t *tr_ds;
 589  588          uint64_t tr_size;
 590  589  };
 591  590  
 592  591  static int
 593  592  dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree,
 594  593      boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list,
 595  594      dmu_tx_t *tx, boolean_t first)
 596  595  {
 597  596          uint64_t txg = tx->tx_txg;
 598  597          uint64_t est_inflight, used_on_disk, quota, parent_rsrv;
 599  598          uint64_t deferred = 0;
 600  599          struct tempreserve *tr;
 601  600          int retval = EDQUOT;
 602  601          int txgidx = txg & TXG_MASK;
 603  602          int i;
 604  603          uint64_t ref_rsrv = 0;
 605  604  
 606  605          ASSERT3U(txg, !=, 0);
 607  606          ASSERT3S(asize, >, 0);
 608  607  
 609  608          mutex_enter(&dd->dd_lock);
 610  609  
 611  610          /*
 612  611           * Check against the dsl_dir's quota.  We don't add in the delta
 613  612           * when checking for over-quota because they get one free hit.
 614  613           */
 615  614          est_inflight = dsl_dir_space_towrite(dd);
 616  615          for (i = 0; i < TXG_SIZE; i++)
 617  616                  est_inflight += dd->dd_tempreserved[i];
 618  617          used_on_disk = dd->dd_phys->dd_used_bytes;
 619  618  
 620  619          /*
 621  620           * On the first iteration, fetch the dataset's used-on-disk and
 622  621           * refreservation values. Also, if checkrefquota is set, test if
 623  622           * allocating this space would exceed the dataset's refquota.
 624  623           */
 625  624          if (first && tx->tx_objset) {
 626  625                  int error;
 627  626                  dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset;
 628  627  
 629  628                  error = dsl_dataset_check_quota(ds, checkrefquota,
 630  629                      asize, est_inflight, &used_on_disk, &ref_rsrv);
 631  630                  if (error) {
 632  631                          mutex_exit(&dd->dd_lock);
 633  632                          return (error);
 634  633                  }
 635  634          }
 636  635  
 637  636          /*
 638  637           * If this transaction will result in a net free of space,
 639  638           * we want to let it through.
 640  639           */
 641  640          if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
 642  641                  quota = UINT64_MAX;
 643  642          else
 644  643                  quota = dd->dd_phys->dd_quota;
 645  644  
 646  645          /*
 647  646           * Adjust the quota against the actual pool size at the root
 648  647           * minus any outstanding deferred frees.
 649  648           * To ensure that it's possible to remove files from a full
 650  649           * pool without inducing transient overcommits, we throttle
 651  650           * netfree transactions against a quota that is slightly larger,
 652  651           * but still within the pool's allocation slop.  In cases where
 653  652           * we're very close to full, this will allow a steady trickle of
 654  653           * removes to get through.
 655  654           */
 656  655          if (dd->dd_parent == NULL) {
 657  656                  spa_t *spa = dd->dd_pool->dp_spa;
 658  657                  uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree);
 659  658                  deferred = metaslab_class_get_deferred(spa_normal_class(spa));
 660  659                  if (poolsize - deferred < quota) {
 661  660                          quota = poolsize - deferred;
 662  661                          retval = ENOSPC;
 663  662                  }
 664  663          }
 665  664  
 666  665          /*
 667  666           * If they are requesting more space, and our current estimate
 668  667           * is over quota, they get to try again unless the actual
 669  668           * on-disk is over quota and there are no pending changes (which
 670  669           * may free up space for us).
 671  670           */
 672  671          if (used_on_disk + est_inflight >= quota) {
 673  672                  if (est_inflight > 0 || used_on_disk < quota ||
 674  673                      (retval == ENOSPC && used_on_disk < quota + deferred))
 675  674                          retval = ERESTART;
 676  675                  dprintf_dd(dd, "failing: used=%lluK inflight = %lluK "
 677  676                      "quota=%lluK tr=%lluK err=%d\n",
 678  677                      used_on_disk>>10, est_inflight>>10,
 679  678                      quota>>10, asize>>10, retval);
 680  679                  mutex_exit(&dd->dd_lock);
 681  680                  return (SET_ERROR(retval));
 682  681          }
 683  682  
 684  683          /* We need to up our estimated delta before dropping dd_lock */
 685  684          dd->dd_tempreserved[txgidx] += asize;
 686  685  
 687  686          parent_rsrv = parent_delta(dd, used_on_disk + est_inflight,
 688  687              asize - ref_rsrv);
 689  688          mutex_exit(&dd->dd_lock);
 690  689  
 691  690          tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 692  691          tr->tr_ds = dd;
 693  692          tr->tr_size = asize;
 694  693          list_insert_tail(tr_list, tr);
 695  694  
 696  695          /* see if it's OK with our parent */
 697  696          if (dd->dd_parent && parent_rsrv) {
 698  697                  boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
 699  698  
 700  699                  return (dsl_dir_tempreserve_impl(dd->dd_parent,
 701  700                      parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
 702  701          } else {
 703  702                  return (0);
 704  703          }
 705  704  }
 706  705  
 707  706  /*
 708  707   * Reserve space in this dsl_dir, to be used in this tx's txg.
 709  708   * After the space has been dirtied (and dsl_dir_willuse_space()
 710  709   * has been called), the reservation should be canceled, using
 711  710   * dsl_dir_tempreserve_clear().
 712  711   */
 713  712  int
 714  713  dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize,
 715  714      uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx)
 716  715  {
 717  716          int err;
 718  717          list_t *tr_list;
 719  718  
 720  719          if (asize == 0) {
 721  720                  *tr_cookiep = NULL;
 722  721                  return (0);
 723  722          }
 724  723  
 725  724          tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
 726  725          list_create(tr_list, sizeof (struct tempreserve),
 727  726              offsetof(struct tempreserve, tr_node));
  
 728  727          ASSERT3S(asize, >, 0);
 729  728          ASSERT3S(fsize, >=, 0);
 730  729  
 731  730          err = arc_tempreserve_space(lsize, tx->tx_txg);
 732  731          if (err == 0) {
 733  732                  struct tempreserve *tr;
 734  733  
 735  734                  tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 736  735                  tr->tr_size = lsize;
 737  736                  list_insert_tail(tr_list, tr);
 738      -
 739      -                err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
 740  737          } else {
 741  738                  if (err == EAGAIN) {
      739 +                        /*
      740 +                         * If arc_memory_throttle() detected that pageout
      741 +                         * is running and we are low on memory, we delay new
      742 +                         * non-pageout transactions to give pageout an
      743 +                         * advantage.
      744 +                         *
      745 +                         * It is unfortunate to be delaying while the caller's
      746 +                         * locks are held.
      747 +                         */
 742  748                          txg_delay(dd->dd_pool, tx->tx_txg,
 743  749                              MSEC2NSEC(10), MSEC2NSEC(10));
 744  750                          err = SET_ERROR(ERESTART);
 745  751                  }
 746      -                dsl_pool_memory_pressure(dd->dd_pool);
 747  752          }
 748  753  
 749  754          if (err == 0) {
 750      -                struct tempreserve *tr;
 751      -
 752      -                tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 753      -                tr->tr_dp = dd->dd_pool;
 754      -                tr->tr_size = asize;
 755      -                list_insert_tail(tr_list, tr);
 756      -
 757  755                  err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
 758  756                      FALSE, asize > usize, tr_list, tx, TRUE);
 759  757          }
 760  758  
 761  759          if (err != 0)
 762  760                  dsl_dir_tempreserve_clear(tr_list, tx);
 763  761          else
 764  762                  *tr_cookiep = tr_list;
 765  763  
 766  764          return (err);
 767  765  }
 768  766  
 769  767  /*
 770  768   * Clear a temporary reservation that we previously made with
 771  769   * dsl_dir_tempreserve_space().
 772  770   */
 773  771  void
 774  772  dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx)
  
 775  773  {
 776  774          int txgidx = tx->tx_txg & TXG_MASK;
 777  775          list_t *tr_list = tr_cookie;
 778  776          struct tempreserve *tr;
 779  777  
 780  778          ASSERT3U(tx->tx_txg, !=, 0);
 781  779  
 782  780          if (tr_cookie == NULL)
 783  781                  return;
 784  782  
 785      -        while (tr = list_head(tr_list)) {
 786      -                if (tr->tr_dp) {
 787      -                        dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
 788      -                } else if (tr->tr_ds) {
      783 +        while ((tr = list_head(tr_list)) != NULL) {
      784 +                if (tr->tr_ds) {
 789  785                          mutex_enter(&tr->tr_ds->dd_lock);
 790  786                          ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
 791  787                              tr->tr_size);
 792  788                          tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size;
 793  789                          mutex_exit(&tr->tr_ds->dd_lock);
 794  790                  } else {
 795  791                          arc_tempreserve_clear(tr->tr_size);
 796  792                  }
 797  793                  list_remove(tr_list, tr);
 798  794                  kmem_free(tr, sizeof (struct tempreserve));
 799  795          }
 800  796  
 801  797          kmem_free(tr_list, sizeof (list_t));
 802  798  }
 803  799  
 804      -static void
 805      -dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
      800 +/*
      801 + * This should be called from open context when we think we're going to write
      802 + * or free space, for example when dirtying data. Be conservative; it's okay
      803 + * to write less space or free more, but we don't want to write more or free
      804 + * less than the amount specified.
      805 + */
      806 +void
      807 +dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 806  808  {
 807  809          int64_t parent_space;
 808  810          uint64_t est_used;
 809  811  
 810  812          mutex_enter(&dd->dd_lock);
 811  813          if (space > 0)
 812  814                  dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
 813  815  
 814  816          est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
 815  817          parent_space = parent_delta(dd, est_used, space);
 816  818          mutex_exit(&dd->dd_lock);
 817  819  
 818  820          /* Make sure that we clean up dd_space_to* */
 819  821          dsl_dir_dirty(dd, tx);
 820  822  
 821  823          /* XXX this is potentially expensive and unnecessary... */
 822  824          if (parent_space && dd->dd_parent)
 823      -                dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
      825 +                dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
 824  826  }
 825  827  
 826      -/*
 827      - * Call in open context when we think we're going to write/free space,
 828      - * eg. when dirtying data.  Be conservative (ie. OK to write less than
 829      - * this or free more than this, but don't write more or free less).
 830      - */
 831      -void
 832      -dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
 833      -{
 834      -        dsl_pool_willuse_space(dd->dd_pool, space, tx);
 835      -        dsl_dir_willuse_space_impl(dd, space, tx);
 836      -}
 837      -
 838  828  /* call from syncing context when we actually write/free space for this dd */
 839  829  void
 840  830  dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
 841  831      int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
 842  832  {
 843  833          int64_t accounted_delta;
 844  834  
 845  835          /*
 846  836           * dsl_dataset_set_refreservation_sync_impl() calls this with
 847  837           * dd_lock held, so that it can atomically update
 848  838           * ds->ds_reserved and the dsl_dir accounting, so that
 849  839           * dsl_dataset_check_quota() can see dataset and dir accounting
 850  840           * consistently.
 851  841           */
 852  842          boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
 853  843  
 854  844          ASSERT(dmu_tx_is_syncing(tx));
 855  845          ASSERT(type < DD_USED_NUM);
 856  846  
 857  847          dmu_buf_will_dirty(dd->dd_dbuf, tx);
 858  848  
 859  849          if (needlock)
 860  850                  mutex_enter(&dd->dd_lock);
 861  851          accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
 862  852          ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
 863  853          ASSERT(compressed >= 0 ||
 864  854              dd->dd_phys->dd_compressed_bytes >= -compressed);
 865  855          ASSERT(uncompressed >= 0 ||
 866  856              dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
 867  857          dd->dd_phys->dd_used_bytes += used;
 868  858          dd->dd_phys->dd_uncompressed_bytes += uncompressed;
 869  859          dd->dd_phys->dd_compressed_bytes += compressed;
 870  860  
 871  861          if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
 872  862                  ASSERT(used > 0 ||
 873  863                      dd->dd_phys->dd_used_breakdown[type] >= -used);
 874  864                  dd->dd_phys->dd_used_breakdown[type] += used;
 875  865  #ifdef DEBUG
 876  866                  dd_used_t t;
 877  867                  uint64_t u = 0;
 878  868                  for (t = 0; t < DD_USED_NUM; t++)
 879  869                          u += dd->dd_phys->dd_used_breakdown[t];
 880  870                  ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
 881  871  #endif
 882  872          }
 883  873          if (needlock)
 884  874                  mutex_exit(&dd->dd_lock);
 885  875  
 886  876          if (dd->dd_parent != NULL) {
 887  877                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
 888  878                      accounted_delta, compressed, uncompressed, tx);
 889  879                  dsl_dir_transfer_space(dd->dd_parent,
 890  880                      used - accounted_delta,
 891  881                      DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
 892  882          }
 893  883  }
 894  884  
 895  885  void
 896  886  dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
 897  887      dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
 898  888  {
 899  889          ASSERT(dmu_tx_is_syncing(tx));
 900  890          ASSERT(oldtype < DD_USED_NUM);
 901  891          ASSERT(newtype < DD_USED_NUM);
 902  892  
 903  893          if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
 904  894                  return;
 905  895  
 906  896          dmu_buf_will_dirty(dd->dd_dbuf, tx);
 907  897          mutex_enter(&dd->dd_lock);
 908  898          ASSERT(delta > 0 ?
 909  899              dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
 910  900              dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
 911  901          ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
 912  902          dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
 913  903          dd->dd_phys->dd_used_breakdown[newtype] += delta;
 914  904          mutex_exit(&dd->dd_lock);
 915  905  }
 916  906  
 917  907  typedef struct dsl_dir_set_qr_arg {
 918  908          const char *ddsqra_name;
 919  909          zprop_source_t ddsqra_source;
 920  910          uint64_t ddsqra_value;
 921  911  } dsl_dir_set_qr_arg_t;
 922  912  
 923  913  static int
 924  914  dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx)
 925  915  {
 926  916          dsl_dir_set_qr_arg_t *ddsqra = arg;
 927  917          dsl_pool_t *dp = dmu_tx_pool(tx);
 928  918          dsl_dataset_t *ds;
 929  919          int error;
 930  920          uint64_t towrite, newval;
 931  921  
 932  922          error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
 933  923          if (error != 0)
 934  924                  return (error);
 935  925  
 936  926          error = dsl_prop_predict(ds->ds_dir, "quota",
 937  927              ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
 938  928          if (error != 0) {
 939  929                  dsl_dataset_rele(ds, FTAG);
 940  930                  return (error);
 941  931          }
 942  932  
 943  933          if (newval == 0) {
 944  934                  dsl_dataset_rele(ds, FTAG);
 945  935                  return (0);
 946  936          }
 947  937  
 948  938          mutex_enter(&ds->ds_dir->dd_lock);
 949  939          /*
 950  940           * If we are doing the preliminary check in open context, and
 951  941           * there are pending changes, then don't fail it, since the
 952  942           * pending changes could under-estimate the amount of space to be
 953  943           * freed up.
 954  944           */
 955  945          towrite = dsl_dir_space_towrite(ds->ds_dir);
 956  946          if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
 957  947              (newval < ds->ds_dir->dd_phys->dd_reserved ||
 958  948              newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) {
 959  949                  error = SET_ERROR(ENOSPC);
 960  950          }
 961  951          mutex_exit(&ds->ds_dir->dd_lock);
 962  952          dsl_dataset_rele(ds, FTAG);
 963  953          return (error);
 964  954  }
 965  955  
 966  956  static void
 967  957  dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx)
 968  958  {
 969  959          dsl_dir_set_qr_arg_t *ddsqra = arg;
 970  960          dsl_pool_t *dp = dmu_tx_pool(tx);
 971  961          dsl_dataset_t *ds;
 972  962          uint64_t newval;
 973  963  
 974  964          VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
 975  965  
 976  966          if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
 977  967                  dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA),
 978  968                      ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
 979  969                      &ddsqra->ddsqra_value, tx);
 980  970  
 981  971                  VERIFY0(dsl_prop_get_int_ds(ds,
 982  972                      zfs_prop_to_name(ZFS_PROP_QUOTA), &newval));
 983  973          } else {
 984  974                  newval = ddsqra->ddsqra_value;
 985  975                  spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
 986  976                      zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval);
 987  977          }
 988  978  
 989  979          dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
 990  980          mutex_enter(&ds->ds_dir->dd_lock);
 991  981          ds->ds_dir->dd_phys->dd_quota = newval;
 992  982          mutex_exit(&ds->ds_dir->dd_lock);
 993  983          dsl_dataset_rele(ds, FTAG);
 994  984  }
 995  985  
 996  986  int
 997  987  dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota)
 998  988  {
 999  989          dsl_dir_set_qr_arg_t ddsqra;
1000  990  
1001  991          ddsqra.ddsqra_name = ddname;
1002  992          ddsqra.ddsqra_source = source;
1003  993          ddsqra.ddsqra_value = quota;
1004  994  
1005  995          return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
1006  996              dsl_dir_set_quota_sync, &ddsqra, 0));
1007  997  }
1008  998  
1009  999  int
1010 1000  dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx)
1011 1001  {
1012 1002          dsl_dir_set_qr_arg_t *ddsqra = arg;
1013 1003          dsl_pool_t *dp = dmu_tx_pool(tx);
1014 1004          dsl_dataset_t *ds;
1015 1005          dsl_dir_t *dd;
1016 1006          uint64_t newval, used, avail;
1017 1007          int error;
1018 1008  
1019 1009          error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
1020 1010          if (error != 0)
1021 1011                  return (error);
1022 1012          dd = ds->ds_dir;
1023 1013  
1024 1014          /*
1025 1015           * If we are doing the preliminary check in open context, the
1026 1016           * space estimates may be inaccurate.
1027 1017           */
1028 1018          if (!dmu_tx_is_syncing(tx)) {
1029 1019                  dsl_dataset_rele(ds, FTAG);
1030 1020                  return (0);
1031 1021          }
1032 1022  
1033 1023          error = dsl_prop_predict(ds->ds_dir,
1034 1024              zfs_prop_to_name(ZFS_PROP_RESERVATION),
1035 1025              ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
1036 1026          if (error != 0) {
1037 1027                  dsl_dataset_rele(ds, FTAG);
1038 1028                  return (error);
1039 1029          }
1040 1030  
1041 1031          mutex_enter(&dd->dd_lock);
1042 1032          used = dd->dd_phys->dd_used_bytes;
1043 1033          mutex_exit(&dd->dd_lock);
1044 1034  
1045 1035          if (dd->dd_parent) {
1046 1036                  avail = dsl_dir_space_available(dd->dd_parent,
1047 1037                      NULL, 0, FALSE);
1048 1038          } else {
1049 1039                  avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
1050 1040          }
1051 1041  
1052 1042          if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) {
1053 1043                  uint64_t delta = MAX(used, newval) -
1054 1044                      MAX(used, dd->dd_phys->dd_reserved);
1055 1045  
1056 1046                  if (delta > avail ||
1057 1047                      (dd->dd_phys->dd_quota > 0 &&
1058 1048                      newval > dd->dd_phys->dd_quota))
1059 1049                          error = SET_ERROR(ENOSPC);
1060 1050          }
1061 1051  
1062 1052          dsl_dataset_rele(ds, FTAG);
1063 1053          return (error);
1064 1054  }
1065 1055  
1066 1056  void
1067 1057  dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx)
1068 1058  {
1069 1059          uint64_t used;
1070 1060          int64_t delta;
1071 1061  
1072 1062          dmu_buf_will_dirty(dd->dd_dbuf, tx);
1073 1063  
1074 1064          mutex_enter(&dd->dd_lock);
1075 1065          used = dd->dd_phys->dd_used_bytes;
1076 1066          delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
1077 1067          dd->dd_phys->dd_reserved = value;
1078 1068  
1079 1069          if (dd->dd_parent != NULL) {
1080 1070                  /* Roll up this additional usage into our ancestors */
1081 1071                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1082 1072                      delta, 0, 0, tx);
1083 1073          }
1084 1074          mutex_exit(&dd->dd_lock);
1085 1075  }
1086 1076  
1087 1077  
1088 1078  static void
1089 1079  dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx)
1090 1080  {
1091 1081          dsl_dir_set_qr_arg_t *ddsqra = arg;
1092 1082          dsl_pool_t *dp = dmu_tx_pool(tx);
1093 1083          dsl_dataset_t *ds;
1094 1084          uint64_t newval;
1095 1085  
1096 1086          VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
1097 1087  
1098 1088          if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) {
1099 1089                  dsl_prop_set_sync_impl(ds,
1100 1090                      zfs_prop_to_name(ZFS_PROP_RESERVATION),
1101 1091                      ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1,
1102 1092                      &ddsqra->ddsqra_value, tx);
1103 1093  
1104 1094                  VERIFY0(dsl_prop_get_int_ds(ds,
1105 1095                      zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval));
1106 1096          } else {
1107 1097                  newval = ddsqra->ddsqra_value;
1108 1098                  spa_history_log_internal_ds(ds, "set", tx, "%s=%lld",
1109 1099                      zfs_prop_to_name(ZFS_PROP_RESERVATION),
1110 1100                      (longlong_t)newval);
1111 1101          }
1112 1102  
1113 1103          dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx);
1114 1104          dsl_dataset_rele(ds, FTAG);
1115 1105  }
1116 1106  
1117 1107  int
1118 1108  dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
1119 1109      uint64_t reservation)
1120 1110  {
1121 1111          dsl_dir_set_qr_arg_t ddsqra;
1122 1112  
1123 1113          ddsqra.ddsqra_name = ddname;
1124 1114          ddsqra.ddsqra_source = source;
1125 1115          ddsqra.ddsqra_value = reservation;
1126 1116  
1127 1117          return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
1128 1118              dsl_dir_set_reservation_sync, &ddsqra, 0));
1129 1119  }
1130 1120  
1131 1121  static dsl_dir_t *
1132 1122  closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2)
1133 1123  {
1134 1124          for (; ds1; ds1 = ds1->dd_parent) {
1135 1125                  dsl_dir_t *dd;
1136 1126                  for (dd = ds2; dd; dd = dd->dd_parent) {
1137 1127                          if (ds1 == dd)
1138 1128                                  return (dd);
1139 1129                  }
1140 1130          }
1141 1131          return (NULL);
1142 1132  }
1143 1133  
1144 1134  /*
1145 1135   * If delta is applied to dd, how much of that delta would be applied to
1146 1136   * ancestor?  Syncing context only.
1147 1137   */
1148 1138  static int64_t
1149 1139  would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor)
1150 1140  {
1151 1141          if (dd == ancestor)
1152 1142                  return (delta);
1153 1143  
1154 1144          mutex_enter(&dd->dd_lock);
1155 1145          delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
1156 1146          mutex_exit(&dd->dd_lock);
1157 1147          return (would_change(dd->dd_parent, delta, ancestor));
1158 1148  }
1159 1149  
1160 1150  typedef struct dsl_dir_rename_arg {
1161 1151          const char *ddra_oldname;
1162 1152          const char *ddra_newname;
1163 1153  } dsl_dir_rename_arg_t;
1164 1154  
1165 1155  /* ARGSUSED */
1166 1156  static int
1167 1157  dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
1168 1158  {
1169 1159          int *deltap = arg;
1170 1160          char namebuf[MAXNAMELEN];
1171 1161  
1172 1162          dsl_dataset_name(ds, namebuf);
1173 1163  
1174 1164          if (strlen(namebuf) + *deltap >= MAXNAMELEN)
1175 1165                  return (SET_ERROR(ENAMETOOLONG));
1176 1166          return (0);
1177 1167  }
1178 1168  
1179 1169  static int
1180 1170  dsl_dir_rename_check(void *arg, dmu_tx_t *tx)
1181 1171  {
1182 1172          dsl_dir_rename_arg_t *ddra = arg;
1183 1173          dsl_pool_t *dp = dmu_tx_pool(tx);
1184 1174          dsl_dir_t *dd, *newparent;
1185 1175          const char *mynewname;
1186 1176          int error;
1187 1177          int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname);
1188 1178  
1189 1179          /* target dir should exist */
1190 1180          error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL);
1191 1181          if (error != 0)
1192 1182                  return (error);
1193 1183  
1194 1184          /* new parent should exist */
1195 1185          error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG,
1196 1186              &newparent, &mynewname);
1197 1187          if (error != 0) {
1198 1188                  dsl_dir_rele(dd, FTAG);
1199 1189                  return (error);
1200 1190          }
1201 1191  
1202 1192          /* can't rename to different pool */
1203 1193          if (dd->dd_pool != newparent->dd_pool) {
1204 1194                  dsl_dir_rele(newparent, FTAG);
1205 1195                  dsl_dir_rele(dd, FTAG);
1206 1196                  return (SET_ERROR(ENXIO));
1207 1197          }
1208 1198  
1209 1199          /* new name should not already exist */
1210 1200          if (mynewname == NULL) {
1211 1201                  dsl_dir_rele(newparent, FTAG);
1212 1202                  dsl_dir_rele(dd, FTAG);
1213 1203                  return (SET_ERROR(EEXIST));
1214 1204          }
1215 1205  
1216 1206          /* if the name length is growing, validate child name lengths */
1217 1207          if (delta > 0) {
1218 1208                  error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename,
1219 1209                      &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS);
1220 1210                  if (error != 0) {
1221 1211                          dsl_dir_rele(newparent, FTAG);
1222 1212                          dsl_dir_rele(dd, FTAG);
1223 1213                          return (error);
1224 1214                  }
1225 1215          }
1226 1216  
1227 1217          if (newparent != dd->dd_parent) {
1228 1218                  /* is there enough space? */
1229 1219                  uint64_t myspace =
1230 1220                      MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
1231 1221  
1232 1222                  /* no rename into our descendant */
1233 1223                  if (closest_common_ancestor(dd, newparent) == dd) {
1234 1224                          dsl_dir_rele(newparent, FTAG);
1235 1225                          dsl_dir_rele(dd, FTAG);
1236 1226                          return (SET_ERROR(EINVAL));
1237 1227                  }
1238 1228  
1239 1229                  error = dsl_dir_transfer_possible(dd->dd_parent,
1240 1230                      newparent, myspace);
1241 1231                  if (error != 0) {
1242 1232                          dsl_dir_rele(newparent, FTAG);
1243 1233                          dsl_dir_rele(dd, FTAG);
1244 1234                          return (error);
1245 1235                  }
1246 1236          }
1247 1237  
1248 1238          dsl_dir_rele(newparent, FTAG);
1249 1239          dsl_dir_rele(dd, FTAG);
1250 1240          return (0);
1251 1241  }
1252 1242  
1253 1243  static void
1254 1244  dsl_dir_rename_sync(void *arg, dmu_tx_t *tx)
1255 1245  {
1256 1246          dsl_dir_rename_arg_t *ddra = arg;
1257 1247          dsl_pool_t *dp = dmu_tx_pool(tx);
1258 1248          dsl_dir_t *dd, *newparent;
1259 1249          const char *mynewname;
1260 1250          int error;
1261 1251          objset_t *mos = dp->dp_meta_objset;
1262 1252  
1263 1253          VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL));
1264 1254          VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent,
1265 1255              &mynewname));
1266 1256  
1267 1257          /* Log this before we change the name. */
1268 1258          spa_history_log_internal_dd(dd, "rename", tx,
1269 1259              "-> %s", ddra->ddra_newname);
1270 1260  
1271 1261          if (newparent != dd->dd_parent) {
1272 1262                  dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
1273 1263                      -dd->dd_phys->dd_used_bytes,
1274 1264                      -dd->dd_phys->dd_compressed_bytes,
1275 1265                      -dd->dd_phys->dd_uncompressed_bytes, tx);
1276 1266                  dsl_dir_diduse_space(newparent, DD_USED_CHILD,
1277 1267                      dd->dd_phys->dd_used_bytes,
1278 1268                      dd->dd_phys->dd_compressed_bytes,
1279 1269                      dd->dd_phys->dd_uncompressed_bytes, tx);
1280 1270  
1281 1271                  if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
1282 1272                          uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
1283 1273                              dd->dd_phys->dd_used_bytes;
1284 1274  
1285 1275                          dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
1286 1276                              -unused_rsrv, 0, 0, tx);
1287 1277                          dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV,
1288 1278                              unused_rsrv, 0, 0, tx);
1289 1279                  }
1290 1280          }
1291 1281  
1292 1282          dmu_buf_will_dirty(dd->dd_dbuf, tx);
1293 1283  
1294 1284          /* remove from old parent zapobj */
1295 1285          error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
1296 1286              dd->dd_myname, tx);
1297 1287          ASSERT0(error);
1298 1288  
1299 1289          (void) strcpy(dd->dd_myname, mynewname);
1300 1290          dsl_dir_rele(dd->dd_parent, dd);
1301 1291          dd->dd_phys->dd_parent_obj = newparent->dd_object;
1302 1292          VERIFY0(dsl_dir_hold_obj(dp,
1303 1293              newparent->dd_object, NULL, dd, &dd->dd_parent));
1304 1294  
1305 1295          /* add to new parent zapobj */
1306 1296          VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj,
1307 1297              dd->dd_myname, 8, 1, &dd->dd_object, tx));
1308 1298  
1309 1299          dsl_prop_notify_all(dd);
1310 1300  
1311 1301          dsl_dir_rele(newparent, FTAG);
1312 1302          dsl_dir_rele(dd, FTAG);
1313 1303  }
1314 1304  
1315 1305  int
1316 1306  dsl_dir_rename(const char *oldname, const char *newname)
1317 1307  {
1318 1308          dsl_dir_rename_arg_t ddra;
1319 1309  
1320 1310          ddra.ddra_oldname = oldname;
1321 1311          ddra.ddra_newname = newname;
1322 1312  
1323 1313          return (dsl_sync_task(oldname,
1324 1314              dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3));
1325 1315  }
1326 1316  
1327 1317  int
1328 1318  dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
1329 1319  {
1330 1320          dsl_dir_t *ancestor;
1331 1321          int64_t adelta;
1332 1322          uint64_t avail;
1333 1323  
1334 1324          ancestor = closest_common_ancestor(sdd, tdd);
1335 1325          adelta = would_change(sdd, -space, ancestor);
1336 1326          avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE);
1337 1327          if (avail < space)
1338 1328                  return (SET_ERROR(ENOSPC));
1339 1329  
1340 1330          return (0);
1341 1331  }
1342 1332  
1343 1333  timestruc_t
1344 1334  dsl_dir_snap_cmtime(dsl_dir_t *dd)
1345 1335  {
1346 1336          timestruc_t t;
1347 1337  
1348 1338          mutex_enter(&dd->dd_lock);
1349 1339          t = dd->dd_snap_cmtime;
1350 1340          mutex_exit(&dd->dd_lock);
1351 1341  
1352 1342          return (t);
1353 1343  }
1354 1344  
1355 1345  void
1356 1346  dsl_dir_snap_cmtime_update(dsl_dir_t *dd)
1357 1347  {
1358 1348          timestruc_t t;
1359 1349  
1360 1350          gethrestime(&t);
1361 1351          mutex_enter(&dd->dd_lock);
1362 1352          dd->dd_snap_cmtime = t;
1363 1353          mutex_exit(&dd->dd_lock);
1364 1354  }
  