Print this page
    
5981 Deadlock in dmu_objset_find_dp
    
      
        | Split | Close | 
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/dmu_objset.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_objset.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25   25   * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  26   26   * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  27   27   * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
  28   28   * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
  29   29   */
  30   30  
  31   31  /* Portions Copyright 2010 Robert Milkowski */
  32   32  
  33   33  #include <sys/cred.h>
  34   34  #include <sys/zfs_context.h>
  35   35  #include <sys/dmu_objset.h>
  36   36  #include <sys/dsl_dir.h>
  37   37  #include <sys/dsl_dataset.h>
  38   38  #include <sys/dsl_prop.h>
  39   39  #include <sys/dsl_pool.h>
  40   40  #include <sys/dsl_synctask.h>
  41   41  #include <sys/dsl_deleg.h>
  42   42  #include <sys/dnode.h>
  43   43  #include <sys/dbuf.h>
  44   44  #include <sys/zvol.h>
  45   45  #include <sys/dmu_tx.h>
  46   46  #include <sys/zap.h>
  47   47  #include <sys/zil.h>
  48   48  #include <sys/dmu_impl.h>
  49   49  #include <sys/zfs_ioctl.h>
  50   50  #include <sys/sa.h>
  51   51  #include <sys/zfs_onexit.h>
  52   52  #include <sys/dsl_destroy.h>
  53   53  #include <sys/vdev.h>
  54   54  
  55   55  /*
  56   56   * Needed to close a window in dnode_move() that allows the objset to be freed
  57   57   * before it can be safely accessed.
           *
           * Initialized by dmu_objset_init(), destroyed by dmu_objset_fini(), and
           * taken briefly as a reader barrier in dmu_objset_evict_done() before the
           * objset is freed.
  58   58   */
  59   59  krwlock_t os_lock;
  60   60  
  61   61  /*
  62   62   * Tunable to override the maximum number of threads for the parallelization
  63   63   * of dmu_objset_find_dp, needed to speed up the import of pools with many
  64   64   * datasets.
  65   65   * Default is 4 times the number of leaf vdevs.
  66   66   */
  67   67  int dmu_find_threads = 0;
  68   68  
  69   69  static void dmu_objset_find_dp_cb(void *arg);
  70   70  
           /*
            * Initialize the global os_lock rwlock.  dmu_objset_fini() destroys it.
            */
  71   71  void
  72   72  dmu_objset_init(void)
  73   73  {
  74   74          rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
  75   75  }
  76   76  
           /*
            * Destroy the global os_lock rwlock set up by dmu_objset_init().
            */
  77   77  void
  78   78  dmu_objset_fini(void)
  79   79  {
  80   80          rw_destroy(&os_lock);
  81   81  }
  82   82  
           /*
            * Accessor: return the spa_t this objset belongs to (os->os_spa).
            */
  83   83  spa_t *
  84   84  dmu_objset_spa(objset_t *os)
  85   85  {
  86   86          return (os->os_spa);
  87   87  }
  88   88  
           /*
            * Accessor: return the objset's ZIL handle (os->os_zil).
            */
  89   89  zilog_t *
  90   90  dmu_objset_zil(objset_t *os)
  91   91  {
  92   92          return (os->os_zil);
  93   93  }
  94   94  
           /*
            * Return the dsl_pool_t for an objset.
            *
            * If the objset has a dataset whose ds_dir is set, the pool is taken from
            * that dsl_dir; otherwise (no dataset, or no ds_dir) it is looked up from
            * the objset's spa via spa_get_dsl().
            */
  95   95  dsl_pool_t *
  96   96  dmu_objset_pool(objset_t *os)
  97   97  {
  98   98          dsl_dataset_t *ds;
  99   99  
 100  100          if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
 101  101                  return (ds->ds_dir->dd_pool);
 102  102          else
 103  103                  return (spa_get_dsl(os->os_spa));
 104  104  }
 105  105  
           /*
            * Accessor: return the dataset backing this objset (os->os_dsl_dataset).
            * Other code in this file treats a NULL value as "this is the meta-objset".
            */
 106  106  dsl_dataset_t *
 107  107  dmu_objset_ds(objset_t *os)
 108  108  {
 109  109          return (os->os_dsl_dataset);
 110  110  }
 111  111  
           /*
            * Return the objset type as recorded in the objset's phys structure
            * (os->os_phys->os_type).
            */
 112  112  dmu_objset_type_t
 113  113  dmu_objset_type(objset_t *os)
 114  114  {
 115  115          return (os->os_phys->os_type);
 116  116  }
 117  117  
           /*
            * Write the name of the objset's dataset into buf via dsl_dataset_name().
            *
            * No buffer length is passed.
            * NOTE(review): presumably buf must be large enough for any dataset name
            * (dmu_objset_refresh_ownership() uses MAXNAMELEN) -- confirm.
            * os->os_dsl_dataset is handed to dsl_dataset_name() without a NULL check.
            */
 118  118  void
 119  119  dmu_objset_name(objset_t *os, char *buf)
 120  120  {
 121  121          dsl_dataset_name(os->os_dsl_dataset, buf);
 122  122  }
 123  123  
           /*
            * Return the object number (ds_object) of the objset's dataset, or 0 when
            * the objset has no dataset.
            */
 124  124  uint64_t
 125  125  dmu_objset_id(objset_t *os)
 126  126  {
 127  127          dsl_dataset_t *ds = os->os_dsl_dataset;
 128  128  
 129  129          return (ds ? ds->ds_object : 0);
 130  130  }
 131  131  
           /*
            * Accessor: return the cached value of the "sync" property (os->os_sync),
            * as last stored by sync_changed_cb() or the meta-objset defaults.
            */
 132  132  zfs_sync_type_t
 133  133  dmu_objset_syncprop(objset_t *os)
 134  134  {
 135  135          return (os->os_sync);
 136  136  }
 137  137  
           /*
            * Accessor: return the cached value of the "logbias" property
            * (os->os_logbias), as last stored by logbias_changed_cb() or the
            * meta-objset defaults.
            */
 138  138  zfs_logbias_op_t
 139  139  dmu_objset_logbias(objset_t *os)
 140  140  {
 141  141          return (os->os_logbias);
 142  142  }
 143  143  
           /*
            * Property callback for ZFS_PROP_CHECKSUM; arg is the objset_t it was
            * registered with in dmu_objset_open_impl().  Resolves newval through
            * zio_checksum_select() (falling back on ZIO_CHECKSUM_ON_VALUE) and caches
            * the result in os_checksum.
            */
 144  144  static void
 145  145  checksum_changed_cb(void *arg, uint64_t newval)
 146  146  {
 147  147          objset_t *os = arg;
 148  148  
 149  149          /*
 150  150           * Inheritance should have been done by now.
 151  151           */
 152  152          ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 153  153  
 154  154          os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
 155  155  }
 156  156  
           /*
            * Property callback for ZFS_PROP_COMPRESSION; arg is the objset_t it was
            * registered with.  Resolves newval through zio_compress_select() for the
            * objset's spa (with ZIO_COMPRESS_ON as the third argument) and caches the
            * result in os_compress.
            */
 157  157  static void
 158  158  compression_changed_cb(void *arg, uint64_t newval)
 159  159  {
 160  160          objset_t *os = arg;
 161  161  
 162  162          /*
 163  163           * Inheritance and range checking should have been done by now.
 164  164           */
 165  165          ASSERT(newval != ZIO_COMPRESS_INHERIT);
 166  166  
 167  167          os->os_compress = zio_compress_select(os->os_spa, newval,
 168  168              ZIO_COMPRESS_ON);
 169  169  }
 170  170  
           /*
            * Property callback for ZFS_PROP_COPIES; arg is the objset_t it was
            * registered with.  Caches newval in os_copies after asserting that it is
            * in the range 1 .. spa_max_replication().
            */
 171  171  static void
 172  172  copies_changed_cb(void *arg, uint64_t newval)
 173  173  {
 174  174          objset_t *os = arg;
 175  175  
 176  176          /*
 177  177           * Inheritance and range checking should have been done by now.
 178  178           */
 179  179          ASSERT(newval > 0);
 180  180          ASSERT(newval <= spa_max_replication(os->os_spa));
 181  181  
 182  182          os->os_copies = newval;
 183  183  }
 184  184  
           /*
            * Property callback for ZFS_PROP_DEDUP; arg is the objset_t it was
            * registered with.  Resolves newval through zio_checksum_dedup_select()
            * (with ZIO_CHECKSUM_OFF as the third argument) and splits the result into
            * the checksum function (os_dedup_checksum) and the verify flag
            * (os_dedup_verify).
            */
 185  185  static void
 186  186  dedup_changed_cb(void *arg, uint64_t newval)
 187  187  {
 188  188          objset_t *os = arg;
 189  189          spa_t *spa = os->os_spa;
 190  190          enum zio_checksum checksum;
 191  191  
 192  192          /*
 193  193           * Inheritance should have been done by now.
 194  194           */
 195  195          ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 196  196  
 197  197          checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
 198  198  
                   /* Low bits select the checksum; ZIO_CHECKSUM_VERIFY is a flag bit. */
 199  199          os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
 200  200          os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
 201  201  }
 202  202  
           /*
            * Property callback for ZFS_PROP_PRIMARYCACHE; arg is the objset_t it was
            * registered with.  Caches newval in os_primary_cache; the value must be
            * one of ZFS_CACHE_ALL, ZFS_CACHE_NONE or ZFS_CACHE_METADATA (asserted).
            */
 203  203  static void
 204  204  primary_cache_changed_cb(void *arg, uint64_t newval)
 205  205  {
 206  206          objset_t *os = arg;
 207  207  
 208  208          /*
 209  209           * Inheritance and range checking should have been done by now.
 210  210           */
 211  211          ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 212  212              newval == ZFS_CACHE_METADATA);
 213  213  
 214  214          os->os_primary_cache = newval;
 215  215  }
 216  216  
           /*
            * Property callback for ZFS_PROP_SECONDARYCACHE; arg is the objset_t it
            * was registered with.  Caches newval in os_secondary_cache; the value
            * must be one of ZFS_CACHE_ALL, ZFS_CACHE_NONE or ZFS_CACHE_METADATA
            * (asserted).
            */
 217  217  static void
 218  218  secondary_cache_changed_cb(void *arg, uint64_t newval)
 219  219  {
 220  220          objset_t *os = arg;
 221  221  
 222  222          /*
 223  223           * Inheritance and range checking should have been done by now.
 224  224           */
 225  225          ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 226  226              newval == ZFS_CACHE_METADATA);
 227  227  
 228  228          os->os_secondary_cache = newval;
 229  229  }
 230  230  
           /*
            * Property callback for ZFS_PROP_SYNC; arg is the objset_t it was
            * registered with.  Caches newval in os_sync and, when the objset already
            * has a ZIL, pushes the new value to it with zil_set_sync().
            */
 231  231  static void
 232  232  sync_changed_cb(void *arg, uint64_t newval)
 233  233  {
 234  234          objset_t *os = arg;
 235  235  
 236  236          /*
 237  237           * Inheritance and range checking should have been done by now.
 238  238           */
 239  239          ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
 240  240              newval == ZFS_SYNC_DISABLED);
 241  241  
 242  242          os->os_sync = newval;
                   /*
                    * os_zil is NULL while dmu_objset_open_impl() is still registering
                    * callbacks (zil_alloc() runs afterwards), hence the check.
                    */
 243  243          if (os->os_zil)
 244  244                  zil_set_sync(os->os_zil, newval);
 245  245  }
 246  246  
           /*
            * Property callback for ZFS_PROP_REDUNDANT_METADATA; arg is the objset_t
            * it was registered with.  Caches newval in os_redundant_metadata; the
            * value must be ZFS_REDUNDANT_METADATA_ALL or ZFS_REDUNDANT_METADATA_MOST
            * (asserted).
            */
 247  247  static void
 248  248  redundant_metadata_changed_cb(void *arg, uint64_t newval)
 249  249  {
 250  250          objset_t *os = arg;
 251  251  
 252  252          /*
 253  253           * Inheritance and range checking should have been done by now.
 254  254           */
 255  255          ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
 256  256              newval == ZFS_REDUNDANT_METADATA_MOST);
 257  257  
 258  258          os->os_redundant_metadata = newval;
 259  259  }
 260  260  
           /*
            * Property callback for ZFS_PROP_LOGBIAS; arg is the objset_t it was
            * registered with.  Caches newval in os_logbias and, when the objset
            * already has a ZIL, pushes the new value to it with zil_set_logbias().
            * The value must be ZFS_LOGBIAS_LATENCY or ZFS_LOGBIAS_THROUGHPUT
            * (asserted).
            */
 261  261  static void
 262  262  logbias_changed_cb(void *arg, uint64_t newval)
 263  263  {
 264  264          objset_t *os = arg;
 265  265  
 266  266          ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
 267  267              newval == ZFS_LOGBIAS_THROUGHPUT);
 268  268          os->os_logbias = newval;
 269  269          if (os->os_zil)
 270  270                  zil_set_logbias(os->os_zil, newval);
 271  271  }
 272  272  
           /*
            * Property callback for ZFS_PROP_RECORDSIZE; arg is the objset_t it was
            * registered with.  Caches newval in os_recordsize without further
            * validation.
            */
 273  273  static void
 274  274  recordsize_changed_cb(void *arg, uint64_t newval)
 275  275  {
 276  276          objset_t *os = arg;
 277  277  
 278  278          os->os_recordsize = newval;
 279  279  }
 280  280  
           /*
            * Byteswap an objset_phys_t in place.
            *
            * buf points at the objset_phys_t and size is its length in bytes, which
            * must be either OBJSET_OLD_PHYS_SIZE or sizeof (objset_phys_t)
            * (asserted).  The meta dnode, ZIL header, type and flags are always
            * swapped; the userused/groupused dnodes are swapped only for the
            * full-size layout.
            */
 281  281  void
 282  282  dmu_objset_byteswap(void *buf, size_t size)
 283  283  {
 284  284          objset_phys_t *osp = buf;
 285  285  
 286  286          ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
 287  287          dnode_byteswap(&osp->os_meta_dnode);
 288  288          byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
 289  289          osp->os_type = BSWAP_64(osp->os_type);
 290  290          osp->os_flags = BSWAP_64(osp->os_flags);
 291  291          if (size == sizeof (objset_phys_t)) {
 292  292                  dnode_byteswap(&osp->os_userused_dnode);
 293  293                  dnode_byteswap(&osp->os_groupused_dnode);
 294  294          }
 295  295  }
 296  296  
           /*
            * Build the in-core objset_t for the objset whose root block pointer is bp.
            *
            * spa - pool the objset lives in.
            * ds  - owning dataset, or NULL for the meta-objset.  When non-NULL the
            *       caller must hold ds->ds_opening_lock (asserted).
            * bp  - root block pointer; if it is a hole, a zeroed phys buffer is
            *       allocated instead of being read from disk.
            * osp - on success, receives the newly allocated objset.
            *
            * Returns 0 on success.  On failure returns the error from arc_read()
            * (with ECKSUM converted to EIO) or from dsl_prop_register(); the objset
            * is freed and *osp is not written.
            */
 297  297  int
 298  298  dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 299  299      objset_t **osp)
 300  300  {
 301  301          objset_t *os;
 302  302          int i, err;
 303  303  
 304  304          ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 305  305  
 306  306          os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
 307  307          os->os_dsl_dataset = ds;
 308  308          os->os_spa = spa;
 309  309          os->os_rootbp = bp;
 310  310          if (!BP_IS_HOLE(os->os_rootbp)) {
                           /* The objset exists on disk: read its phys block. */
 311  311                  arc_flags_t aflags = ARC_FLAG_WAIT;
 312  312                  zbookmark_phys_t zb;
 313  313                  SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 314  314                      ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 315  315  
 316  316                  if (DMU_OS_IS_L2CACHEABLE(os))
 317  317                          aflags |= ARC_FLAG_L2CACHE;
 318  318                  if (DMU_OS_IS_L2COMPRESSIBLE(os))
 319  319                          aflags |= ARC_FLAG_L2COMPRESS;
 320  320  
 321  321                  dprintf_bp(os->os_rootbp, "reading %s", "");
 322  322                  err = arc_read(NULL, spa, os->os_rootbp,
 323  323                      arc_getbuf_func, &os->os_phys_buf,
 324  324                      ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 325  325                  if (err != 0) {
 326  326                          kmem_free(os, sizeof (objset_t));
 327  327                          /* convert checksum errors into IO errors */
 328  328                          if (err == ECKSUM)
 329  329                                  err = SET_ERROR(EIO);
 330  330                          return (err);
 331  331                  }
 332  332  
 333  333                  /* Increase the blocksize if we are permitted. */
 334  334                  if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
 335  335                      arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
                                   /*
                                    * Copy the smaller on-disk image into a zeroed
                                    * full-size buffer and swap it in for the one
                                    * that was read.
                                    */
 336  336                          arc_buf_t *buf = arc_buf_alloc(spa,
 337  337                              sizeof (objset_phys_t), &os->os_phys_buf,
 338  338                              ARC_BUFC_METADATA);
 339  339                          bzero(buf->b_data, sizeof (objset_phys_t));
 340  340                          bcopy(os->os_phys_buf->b_data, buf->b_data,
 341  341                              arc_buf_size(os->os_phys_buf));
 342  342                          (void) arc_buf_remove_ref(os->os_phys_buf,
 343  343                              &os->os_phys_buf);
 344  344                          os->os_phys_buf = buf;
 345  345                  }
 346  346  
 347  347                  os->os_phys = os->os_phys_buf->b_data;
 348  348                  os->os_flags = os->os_phys->os_flags;
 349  349          } else {
                           /*
                            * Root bp is a hole: start from a zeroed phys buffer whose
                            * size depends on the pool version.
                            */
 350  350                  int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
 351  351                      sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
 352  352                  os->os_phys_buf = arc_buf_alloc(spa, size,
 353  353                      &os->os_phys_buf, ARC_BUFC_METADATA);
 354  354                  os->os_phys = os->os_phys_buf->b_data;
 355  355                  bzero(os->os_phys, size);
 356  356          }
 357  357  
 358  358          /*
 359  359           * Note: the changed_cb will be called once before the register
 360  360           * func returns, thus changing the checksum/compression from the
 361  361           * default (fletcher2/off).  Snapshots don't need to know about
 362  362           * checksum/compression/copies.
 363  363           */
 364  364          if (ds != NULL) {
                           /*
                            * Register the callbacks one after another; each step is
                            * skipped once an earlier registration has failed.
                            */
 365  365                  err = dsl_prop_register(ds,
 366  366                      zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 367  367                      primary_cache_changed_cb, os);
 368  368                  if (err == 0) {
 369  369                          err = dsl_prop_register(ds,
 370  370                              zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 371  371                              secondary_cache_changed_cb, os);
 372  372                  }
 373  373                  if (!ds->ds_is_snapshot) {
 374  374                          if (err == 0) {
 375  375                                  err = dsl_prop_register(ds,
 376  376                                      zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 377  377                                      checksum_changed_cb, os);
 378  378                          }
 379  379                          if (err == 0) {
 380  380                                  err = dsl_prop_register(ds,
 381  381                                      zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 382  382                                      compression_changed_cb, os);
 383  383                          }
 384  384                          if (err == 0) {
 385  385                                  err = dsl_prop_register(ds,
 386  386                                      zfs_prop_to_name(ZFS_PROP_COPIES),
 387  387                                      copies_changed_cb, os);
 388  388                          }
 389  389                          if (err == 0) {
 390  390                                  err = dsl_prop_register(ds,
 391  391                                      zfs_prop_to_name(ZFS_PROP_DEDUP),
 392  392                                      dedup_changed_cb, os);
 393  393                          }
 394  394                          if (err == 0) {
 395  395                                  err = dsl_prop_register(ds,
 396  396                                      zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 397  397                                      logbias_changed_cb, os);
 398  398                          }
 399  399                          if (err == 0) {
 400  400                                  err = dsl_prop_register(ds,
 401  401                                      zfs_prop_to_name(ZFS_PROP_SYNC),
 402  402                                      sync_changed_cb, os);
 403  403                          }
 404  404                          if (err == 0) {
 405  405                                  err = dsl_prop_register(ds,
 406  406                                      zfs_prop_to_name(
 407  407                                      ZFS_PROP_REDUNDANT_METADATA),
 408  408                                      redundant_metadata_changed_cb, os);
 409  409                          }
 410  410                          if (err == 0) {
 411  411                                  err = dsl_prop_register(ds,
 412  412                                      zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 413  413                                      recordsize_changed_cb, os);
 414  414                          }
 415  415                  }
 416  416                  if (err != 0) {
                                   /*
                                    * NOTE(review): callbacks that registered
                                    * successfully before the failure are not
                                    * unregistered here although 'os' is freed --
                                    * confirm whether dsl_prop_register() failures can
                                    * leave earlier registrations behind.
                                    */
 417  417                          VERIFY(arc_buf_remove_ref(os->os_phys_buf,
 418  418                              &os->os_phys_buf));
 419  419                          kmem_free(os, sizeof (objset_t));
 420  420                          return (err);
 421  421                  }
 422  422          } else {
 423  423                  /* It's the meta-objset. */
 424  424                  os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
 425  425                  os->os_compress = ZIO_COMPRESS_ON;
 426  426                  os->os_copies = spa_max_replication(spa);
 427  427                  os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
 428  428                  os->os_dedup_verify = B_FALSE;
 429  429                  os->os_logbias = ZFS_LOGBIAS_LATENCY;
 430  430                  os->os_sync = ZFS_SYNC_STANDARD;
 431  431                  os->os_primary_cache = ZFS_CACHE_ALL;
 432  432                  os->os_secondary_cache = ZFS_CACHE_ALL;
 433  433          }
 434  434  
                   /*
                    * Snapshots do not copy the on-disk ZIL header; theirs stays zeroed
                    * from the kmem_zalloc() above.
                    */
 435  435          if (ds == NULL || !ds->ds_is_snapshot)
 436  436                  os->os_zil_header = os->os_phys->os_zil_header;
 437  437          os->os_zil = zil_alloc(os, &os->os_zil_header);
 438  438  
                   /* Per-txg dirty and free lists both link through dn_dirty_link[i]. */
 439  439          for (i = 0; i < TXG_SIZE; i++) {
 440  440                  list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
 441  441                      offsetof(dnode_t, dn_dirty_link[i]));
 442  442                  list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
 443  443                      offsetof(dnode_t, dn_dirty_link[i]));
 444  444          }
 445  445          list_create(&os->os_dnodes, sizeof (dnode_t),
 446  446              offsetof(dnode_t, dn_link));
 447  447          list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
 448  448              offsetof(dmu_buf_impl_t, db_link));
 449  449  
 450  450          mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
 451  451          mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 452  452          mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 453  453  
 454  454          dnode_special_open(os, &os->os_phys->os_meta_dnode,
 455  455              DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
                   /* The accounting dnodes exist only in the full-size phys layout. */
 456  456          if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
 457  457                  dnode_special_open(os, &os->os_phys->os_userused_dnode,
 458  458                      DMU_USERUSED_OBJECT, &os->os_userused_dnode);
 459  459                  dnode_special_open(os, &os->os_phys->os_groupused_dnode,
 460  460                      DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
 461  461          }
 462  462  
 463  463          *osp = os;
 464  464          return (0);
 465  465  }
 466  466  
           /*
            * Return, in *osp, the objset for dataset ds, opening it on first use.
            *
            * Runs under ds->ds_opening_lock.  If ds->ds_objset is not yet set, the
            * objset is opened with dmu_objset_open_impl() and, on success, published
            * in ds->ds_objset under ds->ds_lock.
            *
            * Returns 0 on success or the error from dmu_objset_open_impl().  *osp is
            * always assigned from ds->ds_objset, so it is NULL when the open failed.
            */
 467  467  int
 468  468  dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
 469  469  {
 470  470          int err = 0;
 471  471  
 472  472          mutex_enter(&ds->ds_opening_lock);
 473  473          if (ds->ds_objset == NULL) {
 474  474                  objset_t *os;
 475  475                  err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
 476  476                      ds, dsl_dataset_get_blkptr(ds), &os);
 477  477  
 478  478                  if (err == 0) {
 479  479                          mutex_enter(&ds->ds_lock);
 480  480                          ASSERT(ds->ds_objset == NULL);
 481  481                          ds->ds_objset = os;
 482  482                          mutex_exit(&ds->ds_lock);
 483  483                  }
 484  484          }
 485  485          *osp = ds->ds_objset;
 486  486          mutex_exit(&ds->ds_opening_lock);
 487  487          return (err);
 488  488  }
 489  489  
 490  490  /*
 491  491   * Holds the pool while the objset is held.  Therefore only one objset
 492  492   * can be held at a time.
           *
           * Takes a pool hold and a dataset hold for 'name', both with 'tag', and
           * returns the dataset's objset in *osp.  On any failure every hold taken
           * so far is released and the error is returned.  On success the holds are
           * dropped with dmu_objset_rele() using the same tag.
 493  493   */
 494  494  int
 495  495  dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 496  496  {
 497  497          dsl_pool_t *dp;
 498  498          dsl_dataset_t *ds;
 499  499          int err;
 500  500  
 501  501          err = dsl_pool_hold(name, tag, &dp);
 502  502          if (err != 0)
 503  503                  return (err);
 504  504          err = dsl_dataset_hold(dp, name, tag, &ds);
 505  505          if (err != 0) {
 506  506                  dsl_pool_rele(dp, tag);
 507  507                  return (err);
 508  508          }
 509  509  
 510  510          err = dmu_objset_from_ds(ds, osp);
 511  511          if (err != 0) {
 512  512                  dsl_dataset_rele(ds, tag);
 513  513                  dsl_pool_rele(dp, tag);
 514  514          }
 515  515  
 516  516          return (err);
 517  517  }
 518  518  
           /*
            * Common tail of dmu_objset_own() and dmu_objset_own_obj(): given a
            * dataset the caller has just taken ownership of with 'tag', obtain its
            * objset in *osp and validate the request.
            *
            * On every failure path the dataset is disowned before returning:
            *   - the error from dmu_objset_from_ds() if the objset cannot be opened;
            *   - EINVAL if 'type' is not DMU_OST_ANY and differs from the objset's
            *     on-disk type;
            *   - EROFS if a writable (!readonly) open of a snapshot was requested.
            * Returns 0 on success, leaving the dataset owned by 'tag'.
            */
 519  519  static int
 520  520  dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
 521  521      boolean_t readonly, void *tag, objset_t **osp)
 522  522  {
 523  523          int err;
 524  524  
 525  525          err = dmu_objset_from_ds(ds, osp);
 526  526          if (err != 0) {
 527  527                  dsl_dataset_disown(ds, tag);
 528  528          } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
 529  529                  dsl_dataset_disown(ds, tag);
 530  530                  return (SET_ERROR(EINVAL));
 531  531          } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
 532  532                  dsl_dataset_disown(ds, tag);
 533  533                  return (SET_ERROR(EROFS));
 534  534          }
 535  535          return (err);
 536  536  }
 537  537  
 538  538  /*
 539  539   * dsl_pool must not be held when this is called.
 540  540   * Upon successful return, there will be a longhold on the dataset,
 541  541   * and the dsl_pool will not be held.
           *
           * The pool is held with FTAG only for the duration of this call.  The
           * dataset named 'name' is owned with 'tag' and then validated by
           * dmu_objset_own_impl(), which disowns it again on failure.  Returns 0 and
           * sets *osp on success, otherwise an error.
 542  542   */
 543  543  int
 544  544  dmu_objset_own(const char *name, dmu_objset_type_t type,
 545  545      boolean_t readonly, void *tag, objset_t **osp)
 546  546  {
 547  547          dsl_pool_t *dp;
 548  548          dsl_dataset_t *ds;
 549  549          int err;
 550  550  
 551  551          err = dsl_pool_hold(name, FTAG, &dp);
 552  552          if (err != 0)
 553  553                  return (err);
 554  554          err = dsl_dataset_own(dp, name, tag, &ds);
 555  555          if (err != 0) {
 556  556                  dsl_pool_rele(dp, FTAG);
 557  557                  return (err);
 558  558          }
 559  559          err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
 560  560          dsl_pool_rele(dp, FTAG);
 561  561  
 562  562          return (err);
 563  563  }
 564  564  
           /*
            * Variant of dmu_objset_own() that identifies the dataset by its object
            * number 'obj' within pool 'dp' instead of by name.  No pool hold is taken
            * or released here.
            * NOTE(review): presumably the caller must already hold the pool
            * configuration -- confirm against dsl_dataset_own_obj().
            *
            * Returns 0 and sets *osp on success; otherwise the error from
            * dsl_dataset_own_obj() or dmu_objset_own_impl().
            */
 565  565  int
 566  566  dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
 567  567      boolean_t readonly, void *tag, objset_t **osp)
 568  568  {
 569  569          dsl_dataset_t *ds;
 570  570          int err;
 571  571  
 572  572          err = dsl_dataset_own_obj(dp, obj, tag, &ds);
 573  573          if (err != 0)
 574  574                  return (err);
 575  575  
 576  576          return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
 577  577  }
 578  578  
           /*
            * Release the dataset hold and the pool hold that dmu_objset_hold() took
            * with 'tag'.
            */
 579  579  void
 580  580  dmu_objset_rele(objset_t *os, void *tag)
 581  581  {
                   /* Look the pool up first, while the dataset hold is still in place. */
 582  582          dsl_pool_t *dp = dmu_objset_pool(os);
 583  583          dsl_dataset_rele(os->os_dsl_dataset, tag);
 584  584          dsl_pool_rele(dp, tag);
 585  585  }
 586  586  
 587  587  /*
 588  588   * When we are called, os MUST refer to an objset associated with a dataset
 589  589   * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
 590  590   * == tag.  We will then release and reacquire ownership of the dataset while
 591  591   * holding the pool config_rwlock so that no intervening namespace or
 592  592   * ownership changes can occur.
 593  593   *
 594  594   * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
 595  595   * release the hold on its dataset and acquire a new one on the dataset of the
 596  596   * same name so that it can be partially torn down and reconstructed.
 597  597   */
 598  598  void
 599  599  dmu_objset_refresh_ownership(objset_t *os, void *tag)
 600  600  {
 601  601          dsl_pool_t *dp;
 602  602          dsl_dataset_t *ds, *newds;
 603  603          char name[MAXNAMELEN];
 604  604  
 605  605          ds = os->os_dsl_dataset;
 606  606          VERIFY3P(ds, !=, NULL);
 607  607          VERIFY3P(ds->ds_owner, ==, tag);
 608  608          VERIFY(dsl_dataset_long_held(ds));
 609  609  
                   /* Capture the name and pool before ownership is dropped. */
 610  610          dsl_dataset_name(ds, name);
 611  611          dp = dmu_objset_pool(os);
 612  612          dsl_pool_config_enter(dp, FTAG);
 613  613          dmu_objset_disown(os, tag);
 614  614          VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
                   /* Re-owning by name must yield the dataset 'os' still points at. */
 615  615          VERIFY3P(newds, ==, os->os_dsl_dataset);
 616  616          dsl_pool_config_exit(dp, FTAG);
 617  617  }
 618  618  
           /*
            * Give up the ownership of the objset's dataset that was taken with 'tag'
            * (see dmu_objset_own() and dmu_objset_own_obj()).
            */
 619  619  void
 620  620  dmu_objset_disown(objset_t *os, void *tag)
 621  621  {
 622  622          dsl_dataset_disown(os->os_dsl_dataset, tag);
 623  623  }
 624  624  
           /*
            * Evict the dbufs of every dnode in the objset.
            *
            * Walks os->os_dnodes under os->os_lock.  For each dnode that can be
            * referenced, a marker is inserted after it so that the lock can be
            * dropped while dnode_evict_dbufs() runs; the walk then resumes from the
            * marker.  Finally the dbufs of the special dnodes (groupused, userused
            * and meta) are evicted.
            */
 625  625  void
 626  626  dmu_objset_evict_dbufs(objset_t *os)
 627  627  {
                   /* On-stack placeholder; it is only ever used as a list position. */
 628  628          dnode_t dn_marker;
 629  629          dnode_t *dn;
 630  630  
 631  631          mutex_enter(&os->os_lock);
 632  632          dn = list_head(&os->os_dnodes);
 633  633          while (dn != NULL) {
 634  634                  /*
 635  635                   * Skip dnodes without holds.  We have to do this dance
 636  636                   * because dnode_add_ref() only works if there is already a
 637  637                   * hold.  If the dnode has no holds, then it has no dbufs.
 638  638                   */
 639  639                  if (dnode_add_ref(dn, FTAG)) {
 640  640                          list_insert_after(&os->os_dnodes, dn, &dn_marker);
 641  641                          mutex_exit(&os->os_lock);
 642  642  
 643  643                          dnode_evict_dbufs(dn);
 644  644                          dnode_rele(dn, FTAG);
 645  645  
 646  646                          mutex_enter(&os->os_lock);
                                   /* Resume after the marker, then take it back out. */
 647  647                          dn = list_next(&os->os_dnodes, &dn_marker);
 648  648                          list_remove(&os->os_dnodes, &dn_marker);
 649  649                  } else {
 650  650                          dn = list_next(&os->os_dnodes, dn);
 651  651                  }
 652  652          }
 653  653          mutex_exit(&os->os_lock);
 654  654  
 655  655          if (DMU_USERUSED_DNODE(os) != NULL) {
 656  656                  dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
 657  657                  dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
 658  658          }
 659  659          dnode_evict_dbufs(DMU_META_DNODE(os));
 660  660  }
 661  661  
 662  662  /*
 663  663   * Objset eviction processing is split into two pieces.
 664  664   * The first marks the objset as evicting, evicts any dbufs that
 665  665   * have a refcount of zero, and then queues up the objset for the
 666  666   * second phase of eviction.  Once os->os_dnodes has been cleared by
 667  667   * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
 668  668   * The second phase closes the special dnodes, dequeues the objset from
 669  669   * the list of those undergoing eviction, and finally frees the objset.
 670  670   *
 671  671   * NOTE: Due to asynchronous eviction processing (invocation of
 672  672   *       dnode_buf_pageout()), it is possible for the meta dnode for the
 673  673   *       objset to have no holds even though os->os_dnodes is not empty.
 674  674   */
 675  675  void
 676  676  dmu_objset_evict(objset_t *os)
 677  677  {
 678  678          dsl_dataset_t *ds = os->os_dsl_dataset;
 679  679  
 680  680          for (int t = 0; t < TXG_SIZE; t++)
 681  681                  ASSERT(!dmu_objset_is_dirty(os, t));
 682  682  
                   /*
                    * Undo the property callback registrations made in
                    * dmu_objset_open_impl(); snapshots registered only the two cache
                    * properties.
                    */
 683  683          if (ds) {
 684  684                  if (!ds->ds_is_snapshot) {
 685  685                          VERIFY0(dsl_prop_unregister(ds,
 686  686                              zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 687  687                              checksum_changed_cb, os));
 688  688                          VERIFY0(dsl_prop_unregister(ds,
 689  689                              zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 690  690                              compression_changed_cb, os));
 691  691                          VERIFY0(dsl_prop_unregister(ds,
 692  692                              zfs_prop_to_name(ZFS_PROP_COPIES),
 693  693                              copies_changed_cb, os));
 694  694                          VERIFY0(dsl_prop_unregister(ds,
 695  695                              zfs_prop_to_name(ZFS_PROP_DEDUP),
 696  696                              dedup_changed_cb, os));
 697  697                          VERIFY0(dsl_prop_unregister(ds,
 698  698                              zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 699  699                              logbias_changed_cb, os));
 700  700                          VERIFY0(dsl_prop_unregister(ds,
 701  701                              zfs_prop_to_name(ZFS_PROP_SYNC),
 702  702                              sync_changed_cb, os));
 703  703                          VERIFY0(dsl_prop_unregister(ds,
 704  704                              zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
 705  705                              redundant_metadata_changed_cb, os));
 706  706                          VERIFY0(dsl_prop_unregister(ds,
 707  707                              zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 708  708                              recordsize_changed_cb, os));
 709  709                  }
 710  710                  VERIFY0(dsl_prop_unregister(ds,
 711  711                      zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 712  712                      primary_cache_changed_cb, os));
 713  713                  VERIFY0(dsl_prop_unregister(ds,
 714  714                      zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 715  715                      secondary_cache_changed_cb, os));
 716  716          }
 717  717  
 718  718          if (os->os_sa)
 719  719                  sa_tear_down(os);
 720  720  
 721  721          os->os_evicting = B_TRUE;
 722  722          dmu_objset_evict_dbufs(os);
 723  723  
                   /*
                    * Register as evicting, then run the second phase right away if no
                    * dnodes remain; otherwise this function leaves it pending.
                    */
 724  724          mutex_enter(&os->os_lock);
 725  725          spa_evicting_os_register(os->os_spa, os);
 726  726          if (list_is_empty(&os->os_dnodes)) {
 727  727                  mutex_exit(&os->os_lock);
 728  728                  dmu_objset_evict_done(os);
 729  729          } else {
 730  730                  mutex_exit(&os->os_lock);
 731  731          }
 732  732  }
 733  733  
 734  734  void
 735  735  dmu_objset_evict_done(objset_t *os)
 736  736  {
 737  737          ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 738  738  
 739  739          dnode_special_close(&os->os_meta_dnode);
 740  740          if (DMU_USERUSED_DNODE(os)) {
 741  741                  dnode_special_close(&os->os_userused_dnode);
 742  742                  dnode_special_close(&os->os_groupused_dnode);
 743  743          }
 744  744          zil_free(os->os_zil);
 745  745  
 746  746          VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 747  747  
 748  748          /*
 749  749           * This is a barrier to prevent the objset from going away in
 750  750           * dnode_move() until we can safely ensure that the objset is still in
 751  751           * use. We consider the objset valid before the barrier and invalid
 752  752           * after the barrier.
 753  753           */
 754  754          rw_enter(&os_lock, RW_READER);
 755  755          rw_exit(&os_lock);
 756  756  
 757  757          mutex_destroy(&os->os_lock);
 758  758          mutex_destroy(&os->os_obj_lock);
 759  759          mutex_destroy(&os->os_user_ptr_lock);
 760  760          spa_evicting_os_deregister(os->os_spa, os);
 761  761          kmem_free(os, sizeof (objset_t));
 762  762  }
 763  763  
 764  764  timestruc_t
 765  765  dmu_objset_snap_cmtime(objset_t *os)
 766  766  {
 767  767          return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
 768  768  }
 769  769  
 770  770  /* called from dsl for meta-objset */
 771  771  objset_t *
 772  772  dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 773  773      dmu_objset_type_t type, dmu_tx_t *tx)
 774  774  {
 775  775          objset_t *os;
 776  776          dnode_t *mdn;
 777  777  
 778  778          ASSERT(dmu_tx_is_syncing(tx));
 779  779  
 780  780          if (ds != NULL)
 781  781                  VERIFY0(dmu_objset_from_ds(ds, &os));
 782  782          else
 783  783                  VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
 784  784  
 785  785          mdn = DMU_META_DNODE(os);
 786  786  
 787  787          dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
 788  788              DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
 789  789  
 790  790          /*
 791  791           * We don't want to have to increase the meta-dnode's nlevels
 792  792           * later, because then we could do it in quescing context while
 793  793           * we are also accessing it in open context.
 794  794           *
 795  795           * This precaution is not necessary for the MOS (ds == NULL),
 796  796           * because the MOS is only updated in syncing context.
 797  797           * This is most fortunate: the MOS is the only objset that
 798  798           * needs to be synced multiple times as spa_sync() iterates
 799  799           * to convergence, so minimizing its dn_nlevels matters.
 800  800           */
 801  801          if (ds != NULL) {
 802  802                  int levels = 1;
 803  803  
 804  804                  /*
 805  805                   * Determine the number of levels necessary for the meta-dnode
 806  806                   * to contain DN_MAX_OBJECT dnodes.
 807  807                   */
 808  808                  while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
 809  809                      (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
 810  810                      DN_MAX_OBJECT * sizeof (dnode_phys_t))
 811  811                          levels++;
 812  812  
 813  813                  mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
 814  814                      mdn->dn_nlevels = levels;
 815  815          }
 816  816  
 817  817          ASSERT(type != DMU_OST_NONE);
 818  818          ASSERT(type != DMU_OST_ANY);
 819  819          ASSERT(type < DMU_OST_NUMTYPES);
 820  820          os->os_phys->os_type = type;
 821  821          if (dmu_objset_userused_enabled(os)) {
 822  822                  os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 823  823                  os->os_flags = os->os_phys->os_flags;
 824  824          }
 825  825  
 826  826          dsl_dataset_dirty(ds, tx);
 827  827  
 828  828          return (os);
 829  829  }
 830  830  
 831  831  typedef struct dmu_objset_create_arg {
 832  832          const char *doca_name;
 833  833          cred_t *doca_cred;
 834  834          void (*doca_userfunc)(objset_t *os, void *arg,
 835  835              cred_t *cr, dmu_tx_t *tx);
 836  836          void *doca_userarg;
 837  837          dmu_objset_type_t doca_type;
 838  838          uint64_t doca_flags;
 839  839  } dmu_objset_create_arg_t;
 840  840  
 841  841  /*ARGSUSED*/
 842  842  static int
 843  843  dmu_objset_create_check(void *arg, dmu_tx_t *tx)
 844  844  {
 845  845          dmu_objset_create_arg_t *doca = arg;
 846  846          dsl_pool_t *dp = dmu_tx_pool(tx);
 847  847          dsl_dir_t *pdd;
 848  848          const char *tail;
 849  849          int error;
 850  850  
 851  851          if (strchr(doca->doca_name, '@') != NULL)
 852  852                  return (SET_ERROR(EINVAL));
 853  853  
 854  854          error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
 855  855          if (error != 0)
 856  856                  return (error);
 857  857          if (tail == NULL) {
 858  858                  dsl_dir_rele(pdd, FTAG);
 859  859                  return (SET_ERROR(EEXIST));
 860  860          }
 861  861          error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 862  862              doca->doca_cred);
 863  863          dsl_dir_rele(pdd, FTAG);
 864  864  
 865  865          return (error);
 866  866  }
 867  867  
 868  868  static void
 869  869  dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
 870  870  {
 871  871          dmu_objset_create_arg_t *doca = arg;
 872  872          dsl_pool_t *dp = dmu_tx_pool(tx);
 873  873          dsl_dir_t *pdd;
 874  874          const char *tail;
 875  875          dsl_dataset_t *ds;
 876  876          uint64_t obj;
 877  877          blkptr_t *bp;
 878  878          objset_t *os;
 879  879  
 880  880          VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
 881  881  
 882  882          obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
 883  883              doca->doca_cred, tx);
 884  884  
 885  885          VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 886  886          bp = dsl_dataset_get_blkptr(ds);
 887  887          os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
 888  888              ds, bp, doca->doca_type, tx);
 889  889  
 890  890          if (doca->doca_userfunc != NULL) {
 891  891                  doca->doca_userfunc(os, doca->doca_userarg,
 892  892                      doca->doca_cred, tx);
 893  893          }
 894  894  
 895  895          spa_history_log_internal_ds(ds, "create", tx, "");
 896  896          dsl_dataset_rele(ds, FTAG);
 897  897          dsl_dir_rele(pdd, FTAG);
 898  898  }
 899  899  
 900  900  int
 901  901  dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
 902  902      void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 903  903  {
 904  904          dmu_objset_create_arg_t doca;
 905  905  
 906  906          doca.doca_name = name;
 907  907          doca.doca_cred = CRED();
 908  908          doca.doca_flags = flags;
 909  909          doca.doca_userfunc = func;
 910  910          doca.doca_userarg = arg;
 911  911          doca.doca_type = type;
 912  912  
 913  913          return (dsl_sync_task(name,
 914  914              dmu_objset_create_check, dmu_objset_create_sync, &doca,
 915  915              5, ZFS_SPACE_CHECK_NORMAL));
 916  916  }
 917  917  
 918  918  typedef struct dmu_objset_clone_arg {
 919  919          const char *doca_clone;
 920  920          const char *doca_origin;
 921  921          cred_t *doca_cred;
 922  922  } dmu_objset_clone_arg_t;
 923  923  
 924  924  /*ARGSUSED*/
 925  925  static int
 926  926  dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
 927  927  {
 928  928          dmu_objset_clone_arg_t *doca = arg;
 929  929          dsl_dir_t *pdd;
 930  930          const char *tail;
 931  931          int error;
 932  932          dsl_dataset_t *origin;
 933  933          dsl_pool_t *dp = dmu_tx_pool(tx);
 934  934  
 935  935          if (strchr(doca->doca_clone, '@') != NULL)
 936  936                  return (SET_ERROR(EINVAL));
 937  937  
 938  938          error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
 939  939          if (error != 0)
 940  940                  return (error);
 941  941          if (tail == NULL) {
 942  942                  dsl_dir_rele(pdd, FTAG);
 943  943                  return (SET_ERROR(EEXIST));
 944  944          }
 945  945  
 946  946          error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 947  947              doca->doca_cred);
 948  948          if (error != 0) {
 949  949                  dsl_dir_rele(pdd, FTAG);
 950  950                  return (SET_ERROR(EDQUOT));
 951  951          }
 952  952          dsl_dir_rele(pdd, FTAG);
 953  953  
 954  954          error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
 955  955          if (error != 0)
 956  956                  return (error);
 957  957  
 958  958          /* You can only clone snapshots, not the head datasets. */
 959  959          if (!origin->ds_is_snapshot) {
 960  960                  dsl_dataset_rele(origin, FTAG);
 961  961                  return (SET_ERROR(EINVAL));
 962  962          }
 963  963          dsl_dataset_rele(origin, FTAG);
 964  964  
 965  965          return (0);
 966  966  }
 967  967  
 968  968  static void
 969  969  dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
 970  970  {
 971  971          dmu_objset_clone_arg_t *doca = arg;
 972  972          dsl_pool_t *dp = dmu_tx_pool(tx);
 973  973          dsl_dir_t *pdd;
 974  974          const char *tail;
 975  975          dsl_dataset_t *origin, *ds;
 976  976          uint64_t obj;
 977  977          char namebuf[MAXNAMELEN];
 978  978  
 979  979          VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
 980  980          VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
 981  981  
 982  982          obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
 983  983              doca->doca_cred, tx);
 984  984  
 985  985          VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 986  986          dsl_dataset_name(origin, namebuf);
 987  987          spa_history_log_internal_ds(ds, "clone", tx,
 988  988              "origin=%s (%llu)", namebuf, origin->ds_object);
 989  989          dsl_dataset_rele(ds, FTAG);
 990  990          dsl_dataset_rele(origin, FTAG);
 991  991          dsl_dir_rele(pdd, FTAG);
 992  992  }
 993  993  
 994  994  int
 995  995  dmu_objset_clone(const char *clone, const char *origin)
 996  996  {
 997  997          dmu_objset_clone_arg_t doca;
 998  998  
 999  999          doca.doca_clone = clone;
1000 1000          doca.doca_origin = origin;
1001 1001          doca.doca_cred = CRED();
1002 1002  
1003 1003          return (dsl_sync_task(clone,
1004 1004              dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
1005 1005              5, ZFS_SPACE_CHECK_NORMAL));
1006 1006  }
1007 1007  
1008 1008  int
1009 1009  dmu_objset_snapshot_one(const char *fsname, const char *snapname)
1010 1010  {
1011 1011          int err;
1012 1012          char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
1013 1013          nvlist_t *snaps = fnvlist_alloc();
1014 1014  
1015 1015          fnvlist_add_boolean(snaps, longsnap);
1016 1016          strfree(longsnap);
1017 1017          err = dsl_dataset_snapshot(snaps, NULL, NULL);
1018 1018          fnvlist_free(snaps);
1019 1019          return (err);
1020 1020  }
1021 1021  
1022 1022  static void
1023 1023  dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
1024 1024  {
1025 1025          dnode_t *dn;
1026 1026  
1027 1027          while (dn = list_head(list)) {
1028 1028                  ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
1029 1029                  ASSERT(dn->dn_dbuf->db_data_pending);
1030 1030                  /*
1031 1031                   * Initialize dn_zio outside dnode_sync() because the
1032 1032                   * meta-dnode needs to set it ouside dnode_sync().
1033 1033                   */
1034 1034                  dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
1035 1035                  ASSERT(dn->dn_zio);
1036 1036  
1037 1037                  ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
1038 1038                  list_remove(list, dn);
1039 1039  
1040 1040                  if (newlist) {
1041 1041                          (void) dnode_add_ref(dn, newlist);
1042 1042                          list_insert_tail(newlist, dn);
1043 1043                  }
1044 1044  
1045 1045                  dnode_sync(dn, tx);
1046 1046          }
1047 1047  }
1048 1048  
1049 1049  /* ARGSUSED */
1050 1050  static void
1051 1051  dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
1052 1052  {
1053 1053          blkptr_t *bp = zio->io_bp;
1054 1054          objset_t *os = arg;
1055 1055          dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
1056 1056  
1057 1057          ASSERT(!BP_IS_EMBEDDED(bp));
1058 1058          ASSERT3P(bp, ==, os->os_rootbp);
1059 1059          ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
1060 1060          ASSERT0(BP_GET_LEVEL(bp));
1061 1061  
1062 1062          /*
1063 1063           * Update rootbp fill count: it should be the number of objects
1064 1064           * allocated in the object set (not counting the "special"
1065 1065           * objects that are stored in the objset_phys_t -- the meta
1066 1066           * dnode and user/group accounting objects).
1067 1067           */
1068 1068          bp->blk_fill = 0;
1069 1069          for (int i = 0; i < dnp->dn_nblkptr; i++)
1070 1070                  bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
1071 1071  }
1072 1072  
1073 1073  /* ARGSUSED */
1074 1074  static void
1075 1075  dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
1076 1076  {
1077 1077          blkptr_t *bp = zio->io_bp;
1078 1078          blkptr_t *bp_orig = &zio->io_bp_orig;
1079 1079          objset_t *os = arg;
1080 1080  
1081 1081          if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
1082 1082                  ASSERT(BP_EQUAL(bp, bp_orig));
1083 1083          } else {
1084 1084                  dsl_dataset_t *ds = os->os_dsl_dataset;
1085 1085                  dmu_tx_t *tx = os->os_synctx;
1086 1086  
1087 1087                  (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
1088 1088                  dsl_dataset_block_born(ds, bp, tx);
1089 1089          }
1090 1090  }
1091 1091  
1092 1092  /* called from dsl */
1093 1093  void
1094 1094  dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
1095 1095  {
1096 1096          int txgoff;
1097 1097          zbookmark_phys_t zb;
1098 1098          zio_prop_t zp;
1099 1099          zio_t *zio;
1100 1100          list_t *list;
1101 1101          list_t *newlist = NULL;
1102 1102          dbuf_dirty_record_t *dr;
1103 1103  
1104 1104          dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
1105 1105  
1106 1106          ASSERT(dmu_tx_is_syncing(tx));
1107 1107          /* XXX the write_done callback should really give us the tx... */
1108 1108          os->os_synctx = tx;
1109 1109  
1110 1110          if (os->os_dsl_dataset == NULL) {
1111 1111                  /*
1112 1112                   * This is the MOS.  If we have upgraded,
1113 1113                   * spa_max_replication() could change, so reset
1114 1114                   * os_copies here.
1115 1115                   */
1116 1116                  os->os_copies = spa_max_replication(os->os_spa);
1117 1117          }
1118 1118  
1119 1119          /*
1120 1120           * Create the root block IO
1121 1121           */
1122 1122          SET_BOOKMARK(&zb, os->os_dsl_dataset ?
1123 1123              os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
1124 1124              ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
1125 1125          arc_release(os->os_phys_buf, &os->os_phys_buf);
1126 1126  
1127 1127          dmu_write_policy(os, NULL, 0, 0, &zp);
1128 1128  
1129 1129          zio = arc_write(pio, os->os_spa, tx->tx_txg,
1130 1130              os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
1131 1131              DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
1132 1132              NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
1133 1133              ZIO_FLAG_MUSTSUCCEED, &zb);
1134 1134  
1135 1135          /*
1136 1136           * Sync special dnodes - the parent IO for the sync is the root block
1137 1137           */
1138 1138          DMU_META_DNODE(os)->dn_zio = zio;
1139 1139          dnode_sync(DMU_META_DNODE(os), tx);
1140 1140  
1141 1141          os->os_phys->os_flags = os->os_flags;
1142 1142  
1143 1143          if (DMU_USERUSED_DNODE(os) &&
1144 1144              DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
1145 1145                  DMU_USERUSED_DNODE(os)->dn_zio = zio;
1146 1146                  dnode_sync(DMU_USERUSED_DNODE(os), tx);
1147 1147                  DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
1148 1148                  dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
1149 1149          }
1150 1150  
1151 1151          txgoff = tx->tx_txg & TXG_MASK;
1152 1152  
1153 1153          if (dmu_objset_userused_enabled(os)) {
1154 1154                  newlist = &os->os_synced_dnodes;
1155 1155                  /*
1156 1156                   * We must create the list here because it uses the
1157 1157                   * dn_dirty_link[] of this txg.
1158 1158                   */
1159 1159                  list_create(newlist, sizeof (dnode_t),
1160 1160                      offsetof(dnode_t, dn_dirty_link[txgoff]));
1161 1161          }
1162 1162  
1163 1163          dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
1164 1164          dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
1165 1165  
1166 1166          list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
1167 1167          while (dr = list_head(list)) {
1168 1168                  ASSERT0(dr->dr_dbuf->db_level);
1169 1169                  list_remove(list, dr);
1170 1170                  if (dr->dr_zio)
1171 1171                          zio_nowait(dr->dr_zio);
1172 1172          }
1173 1173          /*
1174 1174           * Free intent log blocks up to this tx.
1175 1175           */
1176 1176          zil_sync(os->os_zil, tx);
1177 1177          os->os_phys->os_zil_header = os->os_zil_header;
1178 1178          zio_nowait(zio);
1179 1179  }
1180 1180  
1181 1181  boolean_t
1182 1182  dmu_objset_is_dirty(objset_t *os, uint64_t txg)
1183 1183  {
1184 1184          return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
1185 1185              !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
1186 1186  }
1187 1187  
1188 1188  static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
1189 1189  
1190 1190  void
1191 1191  dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
1192 1192  {
1193 1193          used_cbs[ost] = cb;
1194 1194  }
1195 1195  
1196 1196  boolean_t
1197 1197  dmu_objset_userused_enabled(objset_t *os)
1198 1198  {
1199 1199          return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
1200 1200              used_cbs[os->os_phys->os_type] != NULL &&
1201 1201              DMU_USERUSED_DNODE(os) != NULL);
1202 1202  }
1203 1203  
1204 1204  static void
1205 1205  do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
1206 1206      uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
1207 1207  {
1208 1208          if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
1209 1209                  int64_t delta = DNODE_SIZE + used;
1210 1210                  if (subtract)
1211 1211                          delta = -delta;
1212 1212                  VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
1213 1213                      user, delta, tx));
1214 1214                  VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
1215 1215                      group, delta, tx));
1216 1216          }
1217 1217  }
1218 1218  
1219 1219  void
1220 1220  dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
1221 1221  {
1222 1222          dnode_t *dn;
1223 1223          list_t *list = &os->os_synced_dnodes;
1224 1224  
1225 1225          ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
1226 1226  
1227 1227          while (dn = list_head(list)) {
1228 1228                  int flags;
1229 1229                  ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
1230 1230                  ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
1231 1231                      dn->dn_phys->dn_flags &
1232 1232                      DNODE_FLAG_USERUSED_ACCOUNTED);
1233 1233  
1234 1234                  /* Allocate the user/groupused objects if necessary. */
1235 1235                  if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
1236 1236                          VERIFY(0 == zap_create_claim(os,
1237 1237                              DMU_USERUSED_OBJECT,
1238 1238                              DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
1239 1239                          VERIFY(0 == zap_create_claim(os,
1240 1240                              DMU_GROUPUSED_OBJECT,
1241 1241                              DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
1242 1242                  }
1243 1243  
1244 1244                  /*
1245 1245                   * We intentionally modify the zap object even if the
1246 1246                   * net delta is zero.  Otherwise
1247 1247                   * the block of the zap obj could be shared between
1248 1248                   * datasets but need to be different between them after
1249 1249                   * a bprewrite.
1250 1250                   */
1251 1251  
1252 1252                  flags = dn->dn_id_flags;
1253 1253                  ASSERT(flags);
1254 1254                  if (flags & DN_ID_OLD_EXIST)  {
1255 1255                          do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
1256 1256                              dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
1257 1257                  }
1258 1258                  if (flags & DN_ID_NEW_EXIST) {
1259 1259                          do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
1260 1260                              dn->dn_phys->dn_flags,  dn->dn_newuid,
1261 1261                              dn->dn_newgid, B_FALSE, tx);
1262 1262                  }
1263 1263  
1264 1264                  mutex_enter(&dn->dn_mtx);
1265 1265                  dn->dn_oldused = 0;
1266 1266                  dn->dn_oldflags = 0;
1267 1267                  if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
1268 1268                          dn->dn_olduid = dn->dn_newuid;
1269 1269                          dn->dn_oldgid = dn->dn_newgid;
1270 1270                          dn->dn_id_flags |= DN_ID_OLD_EXIST;
1271 1271                          if (dn->dn_bonuslen == 0)
1272 1272                                  dn->dn_id_flags |= DN_ID_CHKED_SPILL;
1273 1273                          else
1274 1274                                  dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1275 1275                  }
1276 1276                  dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
1277 1277                  mutex_exit(&dn->dn_mtx);
1278 1278  
1279 1279                  list_remove(list, dn);
1280 1280                  dnode_rele(dn, list);
1281 1281          }
1282 1282  }
1283 1283  
1284 1284  /*
1285 1285   * Returns a pointer to data to find uid/gid from
1286 1286   *
1287 1287   * If a dirty record for transaction group that is syncing can't
1288 1288   * be found then NULL is returned.  In the NULL case it is assumed
1289 1289   * the uid/gid aren't changing.
1290 1290   */
1291 1291  static void *
1292 1292  dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
1293 1293  {
1294 1294          dbuf_dirty_record_t *dr, **drp;
1295 1295          void *data;
1296 1296  
1297 1297          if (db->db_dirtycnt == 0)
1298 1298                  return (db->db.db_data);  /* Nothing is changing */
1299 1299  
1300 1300          for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1301 1301                  if (dr->dr_txg == tx->tx_txg)
1302 1302                          break;
1303 1303  
1304 1304          if (dr == NULL) {
1305 1305                  data = NULL;
1306 1306          } else {
1307 1307                  dnode_t *dn;
1308 1308  
1309 1309                  DB_DNODE_ENTER(dr->dr_dbuf);
1310 1310                  dn = DB_DNODE(dr->dr_dbuf);
1311 1311  
1312 1312                  if (dn->dn_bonuslen == 0 &&
1313 1313                      dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
1314 1314                          data = dr->dt.dl.dr_data->b_data;
1315 1315                  else
1316 1316                          data = dr->dt.dl.dr_data;
1317 1317  
1318 1318                  DB_DNODE_EXIT(dr->dr_dbuf);
1319 1319          }
1320 1320  
1321 1321          return (data);
1322 1322  }
1323 1323  
1324 1324  void
1325 1325  dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
1326 1326  {
1327 1327          objset_t *os = dn->dn_objset;
1328 1328          void *data = NULL;
1329 1329          dmu_buf_impl_t *db = NULL;
1330 1330          uint64_t *user = NULL;
1331 1331          uint64_t *group = NULL;
1332 1332          int flags = dn->dn_id_flags;
1333 1333          int error;
1334 1334          boolean_t have_spill = B_FALSE;
1335 1335  
1336 1336          if (!dmu_objset_userused_enabled(dn->dn_objset))
1337 1337                  return;
1338 1338  
1339 1339          if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
1340 1340              DN_ID_CHKED_SPILL)))
1341 1341                  return;
1342 1342  
1343 1343          if (before && dn->dn_bonuslen != 0)
1344 1344                  data = DN_BONUS(dn->dn_phys);
1345 1345          else if (!before && dn->dn_bonuslen != 0) {
1346 1346                  if (dn->dn_bonus) {
1347 1347                          db = dn->dn_bonus;
1348 1348                          mutex_enter(&db->db_mtx);
1349 1349                          data = dmu_objset_userquota_find_data(db, tx);
1350 1350                  } else {
1351 1351                          data = DN_BONUS(dn->dn_phys);
1352 1352                  }
1353 1353          } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
1354 1354                          int rf = 0;
1355 1355  
1356 1356                          if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
1357 1357                                  rf |= DB_RF_HAVESTRUCT;
1358 1358                          error = dmu_spill_hold_by_dnode(dn,
1359 1359                              rf | DB_RF_MUST_SUCCEED,
1360 1360                              FTAG, (dmu_buf_t **)&db);
1361 1361                          ASSERT(error == 0);
1362 1362                          mutex_enter(&db->db_mtx);
1363 1363                          data = (before) ? db->db.db_data :
1364 1364                              dmu_objset_userquota_find_data(db, tx);
1365 1365                          have_spill = B_TRUE;
1366 1366          } else {
1367 1367                  mutex_enter(&dn->dn_mtx);
1368 1368                  dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1369 1369                  mutex_exit(&dn->dn_mtx);
1370 1370                  return;
1371 1371          }
1372 1372  
1373 1373          if (before) {
1374 1374                  ASSERT(data);
1375 1375                  user = &dn->dn_olduid;
1376 1376                  group = &dn->dn_oldgid;
1377 1377          } else if (data) {
1378 1378                  user = &dn->dn_newuid;
1379 1379                  group = &dn->dn_newgid;
1380 1380          }
1381 1381  
1382 1382          /*
1383 1383           * Must always call the callback in case the object
1384 1384           * type has changed and that type isn't an object type to track
1385 1385           */
1386 1386          error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
1387 1387              user, group);
1388 1388  
1389 1389          /*
1390 1390           * Preserve existing uid/gid when the callback can't determine
1391 1391           * what the new uid/gid are and the callback returned EEXIST.
1392 1392           * The EEXIST error tells us to just use the existing uid/gid.
1393 1393           * If we don't know what the old values are then just assign
1394 1394           * them to 0, since that is a new file  being created.
1395 1395           */
1396 1396          if (!before && data == NULL && error == EEXIST) {
1397 1397                  if (flags & DN_ID_OLD_EXIST) {
1398 1398                          dn->dn_newuid = dn->dn_olduid;
1399 1399                          dn->dn_newgid = dn->dn_oldgid;
1400 1400                  } else {
1401 1401                          dn->dn_newuid = 0;
1402 1402                          dn->dn_newgid = 0;
1403 1403                  }
1404 1404                  error = 0;
1405 1405          }
1406 1406  
1407 1407          if (db)
1408 1408                  mutex_exit(&db->db_mtx);
1409 1409  
1410 1410          mutex_enter(&dn->dn_mtx);
1411 1411          if (error == 0 && before)
1412 1412                  dn->dn_id_flags |= DN_ID_OLD_EXIST;
1413 1413          if (error == 0 && !before)
1414 1414                  dn->dn_id_flags |= DN_ID_NEW_EXIST;
1415 1415  
1416 1416          if (have_spill) {
1417 1417                  dn->dn_id_flags |= DN_ID_CHKED_SPILL;
1418 1418          } else {
1419 1419                  dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1420 1420          }
1421 1421          mutex_exit(&dn->dn_mtx);
1422 1422          if (have_spill)
1423 1423                  dmu_buf_rele((dmu_buf_t *)db, FTAG);
1424 1424  }
1425 1425  
1426 1426  boolean_t
1427 1427  dmu_objset_userspace_present(objset_t *os)
1428 1428  {
1429 1429          return (os->os_phys->os_flags &
1430 1430              OBJSET_FLAG_USERACCOUNTING_COMPLETE);
1431 1431  }
1432 1432  
1433 1433  int
1434 1434  dmu_objset_userspace_upgrade(objset_t *os)
1435 1435  {
1436 1436          uint64_t obj;
1437 1437          int err = 0;
1438 1438  
1439 1439          if (dmu_objset_userspace_present(os))
1440 1440                  return (0);
1441 1441          if (!dmu_objset_userused_enabled(os))
1442 1442                  return (SET_ERROR(ENOTSUP));
1443 1443          if (dmu_objset_is_snapshot(os))
1444 1444                  return (SET_ERROR(EINVAL));
1445 1445  
1446 1446          /*
1447 1447           * We simply need to mark every object dirty, so that it will be
1448 1448           * synced out and now accounted.  If this is called
1449 1449           * concurrently, or if we already did some work before crashing,
1450 1450           * that's fine, since we track each object's accounted state
1451 1451           * independently.
1452 1452           */
1453 1453  
1454 1454          for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
1455 1455                  dmu_tx_t *tx;
1456 1456                  dmu_buf_t *db;
1457 1457                  int objerr;
1458 1458  
1459 1459                  if (issig(JUSTLOOKING) && issig(FORREAL))
1460 1460                          return (SET_ERROR(EINTR));
1461 1461  
1462 1462                  objerr = dmu_bonus_hold(os, obj, FTAG, &db);
1463 1463                  if (objerr != 0)
1464 1464                          continue;
1465 1465                  tx = dmu_tx_create(os);
1466 1466                  dmu_tx_hold_bonus(tx, obj);
1467 1467                  objerr = dmu_tx_assign(tx, TXG_WAIT);
1468 1468                  if (objerr != 0) {
1469 1469                          dmu_tx_abort(tx);
1470 1470                          continue;
1471 1471                  }
1472 1472                  dmu_buf_will_dirty(db, tx);
1473 1473                  dmu_buf_rele(db, FTAG);
1474 1474                  dmu_tx_commit(tx);
1475 1475          }
1476 1476  
1477 1477          os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
1478 1478          txg_wait_synced(dmu_objset_pool(os), 0);
1479 1479          return (0);
1480 1480  }
1481 1481  
1482 1482  void
1483 1483  dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
1484 1484      uint64_t *usedobjsp, uint64_t *availobjsp)
1485 1485  {
1486 1486          dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
1487 1487              usedobjsp, availobjsp);
1488 1488  }
1489 1489  
1490 1490  uint64_t
1491 1491  dmu_objset_fsid_guid(objset_t *os)
1492 1492  {
1493 1493          return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
1494 1494  }
1495 1495  
1496 1496  void
1497 1497  dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
1498 1498  {
1499 1499          stat->dds_type = os->os_phys->os_type;
1500 1500          if (os->os_dsl_dataset)
1501 1501                  dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
1502 1502  }
1503 1503  
1504 1504  void
1505 1505  dmu_objset_stats(objset_t *os, nvlist_t *nv)
1506 1506  {
1507 1507          ASSERT(os->os_dsl_dataset ||
1508 1508              os->os_phys->os_type == DMU_OST_META);
1509 1509  
1510 1510          if (os->os_dsl_dataset != NULL)
1511 1511                  dsl_dataset_stats(os->os_dsl_dataset, nv);
1512 1512  
1513 1513          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
1514 1514              os->os_phys->os_type);
1515 1515          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
1516 1516              dmu_objset_userspace_present(os));
1517 1517  }
1518 1518  
1519 1519  int
1520 1520  dmu_objset_is_snapshot(objset_t *os)
1521 1521  {
1522 1522          if (os->os_dsl_dataset != NULL)
1523 1523                  return (os->os_dsl_dataset->ds_is_snapshot);
1524 1524          else
1525 1525                  return (B_FALSE);
1526 1526  }
1527 1527  
1528 1528  int
1529 1529  dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
1530 1530      boolean_t *conflict)
1531 1531  {
1532 1532          dsl_dataset_t *ds = os->os_dsl_dataset;
1533 1533          uint64_t ignored;
1534 1534  
1535 1535          if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
1536 1536                  return (SET_ERROR(ENOENT));
1537 1537  
1538 1538          return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
1539 1539              dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
1540 1540              MT_FIRST, real, maxlen, conflict));
1541 1541  }
1542 1542  
1543 1543  int
1544 1544  dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
1545 1545      uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
1546 1546  {
1547 1547          dsl_dataset_t *ds = os->os_dsl_dataset;
1548 1548          zap_cursor_t cursor;
1549 1549          zap_attribute_t attr;
1550 1550  
1551 1551          ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
1552 1552  
1553 1553          if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
1554 1554                  return (SET_ERROR(ENOENT));
1555 1555  
1556 1556          zap_cursor_init_serialized(&cursor,
1557 1557              ds->ds_dir->dd_pool->dp_meta_objset,
1558 1558              dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
1559 1559  
1560 1560          if (zap_cursor_retrieve(&cursor, &attr) != 0) {
1561 1561                  zap_cursor_fini(&cursor);
1562 1562                  return (SET_ERROR(ENOENT));
1563 1563          }
1564 1564  
1565 1565          if (strlen(attr.za_name) + 1 > namelen) {
1566 1566                  zap_cursor_fini(&cursor);
1567 1567                  return (SET_ERROR(ENAMETOOLONG));
1568 1568          }
1569 1569  
1570 1570          (void) strcpy(name, attr.za_name);
1571 1571          if (idp)
1572 1572                  *idp = attr.za_first_integer;
1573 1573          if (case_conflict)
1574 1574                  *case_conflict = attr.za_normalization_conflict;
1575 1575          zap_cursor_advance(&cursor);
1576 1576          *offp = zap_cursor_serialize(&cursor);
1577 1577          zap_cursor_fini(&cursor);
1578 1578  
1579 1579          return (0);
1580 1580  }
1581 1581  
1582 1582  int
1583 1583  dmu_dir_list_next(objset_t *os, int namelen, char *name,
1584 1584      uint64_t *idp, uint64_t *offp)
1585 1585  {
1586 1586          dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
1587 1587          zap_cursor_t cursor;
1588 1588          zap_attribute_t attr;
1589 1589  
1590 1590          /* there is no next dir on a snapshot! */
1591 1591          if (os->os_dsl_dataset->ds_object !=
1592 1592              dsl_dir_phys(dd)->dd_head_dataset_obj)
1593 1593                  return (SET_ERROR(ENOENT));
1594 1594  
1595 1595          zap_cursor_init_serialized(&cursor,
1596 1596              dd->dd_pool->dp_meta_objset,
1597 1597              dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
1598 1598  
1599 1599          if (zap_cursor_retrieve(&cursor, &attr) != 0) {
1600 1600                  zap_cursor_fini(&cursor);
1601 1601                  return (SET_ERROR(ENOENT));
1602 1602          }
1603 1603  
1604 1604          if (strlen(attr.za_name) + 1 > namelen) {
1605 1605                  zap_cursor_fini(&cursor);
1606 1606                  return (SET_ERROR(ENAMETOOLONG));
1607 1607          }
1608 1608  
1609 1609          (void) strcpy(name, attr.za_name);
1610 1610          if (idp)
1611 1611                  *idp = attr.za_first_integer;
1612 1612          zap_cursor_advance(&cursor);
1613 1613          *offp = zap_cursor_serialize(&cursor);
1614 1614          zap_cursor_fini(&cursor);
1615 1615  
1616 1616          return (0);
1617 1617  }
1618 1618  
/*
 * Per-directory work item for dmu_objset_find_dp().  One context is
 * allocated for every dsl_dir visited and is freed by
 * dmu_objset_find_dp_impl() when that directory has been processed.
 */
typedef struct dmu_objset_find_ctx {
        /* taskq used to dispatch child dirs; NULL means recurse inline */
        taskq_t         *dc_tq;
        /* pool being traversed */
        dsl_pool_t      *dc_dp;
        /* object number of the dsl_dir this item visits */
        uint64_t        dc_ddobj;
        /* callback invoked for each dataset found */
        int             (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
        /* opaque argument handed to dc_func */
        void            *dc_arg;
        /* DS_FIND_* flags selecting children and/or snapshots */
        int             dc_flags;
        /* lock serializing updates of *dc_error */
        kmutex_t        *dc_error_lock;
        /* shared result; only the first error is recorded */
        int             *dc_error;
} dmu_objset_find_ctx_t;
1629 1629  
1630 1630  static void
1631 1631  dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
1632 1632  {
1633 1633          dsl_pool_t *dp = dcp->dc_dp;
1634 1634          dmu_objset_find_ctx_t *child_dcp;
1635 1635          dsl_dir_t *dd;
1636 1636          dsl_dataset_t *ds;
1637 1637          zap_cursor_t zc;
1638 1638          zap_attribute_t *attr;
1639 1639          uint64_t thisobj;
1640 1640          int err = 0;
1641 1641  
1642 1642          /* don't process if there already was an error */
1643 1643          if (*dcp->dc_error != 0)
1644 1644                  goto out;
1645 1645  
1646 1646          err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
1647 1647          if (err != 0)
1648 1648                  goto out;
1649 1649  
1650 1650          /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
1651 1651          if (dd->dd_myname[0] == '$') {
1652 1652                  dsl_dir_rele(dd, FTAG);
1653 1653                  goto out;
1654 1654          }
1655 1655  
1656 1656          thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
1657 1657          attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
1658 1658  
1659 1659          /*
1660 1660           * Iterate over all children.
1661 1661           */
1662 1662          if (dcp->dc_flags & DS_FIND_CHILDREN) {
1663 1663                  for (zap_cursor_init(&zc, dp->dp_meta_objset,
1664 1664                      dsl_dir_phys(dd)->dd_child_dir_zapobj);
1665 1665                      zap_cursor_retrieve(&zc, attr) == 0;
1666 1666                      (void) zap_cursor_advance(&zc)) {
1667 1667                          ASSERT3U(attr->za_integer_length, ==,
1668 1668                              sizeof (uint64_t));
1669 1669                          ASSERT3U(attr->za_num_integers, ==, 1);
1670 1670  
1671 1671                          child_dcp = kmem_alloc(sizeof(*child_dcp), KM_SLEEP);
1672 1672                          *child_dcp = *dcp;
1673 1673                          child_dcp->dc_ddobj = attr->za_first_integer;
1674 1674                          if (dcp->dc_tq != NULL)
1675 1675                                  (void) taskq_dispatch(dcp->dc_tq,
1676 1676                                      dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
1677 1677                          else
1678 1678                                  dmu_objset_find_dp_impl(child_dcp);
1679 1679                  }
1680 1680                  zap_cursor_fini(&zc);
1681 1681          }
1682 1682  
1683 1683          /*
1684 1684           * Iterate over all snapshots.
1685 1685           */
1686 1686          if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
1687 1687                  dsl_dataset_t *ds;
1688 1688                  err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1689 1689  
1690 1690                  if (err == 0) {
1691 1691                          uint64_t snapobj;
1692 1692  
1693 1693                          snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
1694 1694                          dsl_dataset_rele(ds, FTAG);
1695 1695  
1696 1696                          for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
1697 1697                              zap_cursor_retrieve(&zc, attr) == 0;
1698 1698                              (void) zap_cursor_advance(&zc)) {
1699 1699                                  ASSERT3U(attr->za_integer_length, ==,
1700 1700                                      sizeof (uint64_t));
1701 1701                                  ASSERT3U(attr->za_num_integers, ==, 1);
1702 1702  
1703 1703                                  err = dsl_dataset_hold_obj(dp,
1704 1704                                      attr->za_first_integer, FTAG, &ds);
1705 1705                                  if (err != 0)
1706 1706                                          break;
1707 1707                                  err = dcp->dc_func(dp, ds, dcp->dc_arg);
1708 1708                                  dsl_dataset_rele(ds, FTAG);
1709 1709                                  if (err != 0)
1710 1710                                          break;
1711 1711                          }
1712 1712                          zap_cursor_fini(&zc);
1713 1713                  }
1714 1714          }
1715 1715  
1716 1716          dsl_dir_rele(dd, FTAG);
1717 1717          kmem_free(attr, sizeof (zap_attribute_t));
1718 1718  
1719 1719          if (err != 0)
1720 1720                  goto out;
1721 1721  
1722 1722          /*
1723 1723           * Apply to self.
1724 1724           */
1725 1725          err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1726 1726          if (err != 0)
1727 1727                  goto out;
1728 1728          err = dcp->dc_func(dp, ds, dcp->dc_arg);
1729 1729          dsl_dataset_rele(ds, FTAG);
1730 1730  
1731 1731  out:
1732 1732          if (err != 0) {
1733 1733                  mutex_enter(dcp->dc_error_lock);
1734 1734                  /* only keep first error */
1735 1735                  if (*dcp->dc_error == 0)
1736 1736                          *dcp->dc_error = err;
1737 1737                  mutex_exit(dcp->dc_error_lock);
1738 1738          }
  
    | ↓ open down ↓ | 1738 lines elided | ↑ open up ↑ | 
1739 1739  
1740 1740          kmem_free(dcp, sizeof(*dcp));
1741 1741  }
1742 1742  
1743 1743  static void
1744 1744  dmu_objset_find_dp_cb(void *arg)
1745 1745  {
1746 1746          dmu_objset_find_ctx_t *dcp = arg;
1747 1747          dsl_pool_t *dp = dcp->dc_dp;
1748 1748  
1749      -        dsl_pool_config_enter(dp, FTAG);
     1749 +        /*
     1750 +         * We need to get a pool_config_lock here, as there are several
     1751 +         * asssert(pool_config_held) down the stack. Getting a lock via
     1752 +         * dsl_pool_config_enter is risky, as it might be stalled by a
     1753 +         * pending writer. This would deadlock, as the write lock can
     1754 +         * only be granted when our parent thread gives up the lock.
     1755 +         * The _prio interface gives us priority over a pending writer.
     1756 +         * On the other hand, we don't risk to stall any pending writers,
     1757 +         * as the parent thread already holds a config lock. We give up
     1758 +         * our lock before the parent does, so in effect we do not prolong
     1759 +         * the waiting time for the writer.
     1760 +         */
     1761 +        dsl_pool_config_enter_prio(dp, FTAG);
1750 1762  
1751 1763          dmu_objset_find_dp_impl(dcp);
1752 1764  
1753 1765          dsl_pool_config_exit(dp, FTAG);
1754 1766  }
1755 1767  
1756 1768  /*
1757 1769   * Find objsets under and including ddobj, call func(ds) on each.
1758 1770   * The order for the enumeration is completely undefined.
1759 1771   * func is called with dsl_pool_config held.
1760 1772   */
1761 1773  int
1762 1774  dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
1763 1775      int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
1764 1776  {
1765 1777          int error = 0;
1766 1778          taskq_t *tq = NULL;
1767 1779          int ntasks;
1768 1780          dmu_objset_find_ctx_t *dcp;
1769 1781          kmutex_t err_lock;
1770 1782  
1771 1783          mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
1772 1784          dcp = kmem_alloc(sizeof(*dcp), KM_SLEEP);
1773 1785          dcp->dc_tq = NULL;
1774 1786          dcp->dc_dp = dp;
1775 1787          dcp->dc_ddobj = ddobj;
1776 1788          dcp->dc_func = func;
1777 1789          dcp->dc_arg = arg;
1778 1790          dcp->dc_flags = flags;
1779 1791          dcp->dc_error_lock = &err_lock;
1780 1792          dcp->dc_error = &error;
1781 1793  
1782 1794          if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
1783 1795                  /*
1784 1796                   * In case a write lock is held we can't make use of
1785 1797                   * parallelism, as down the stack of the worker threads
1786 1798                   * the lock is asserted via dsl_pool_config_held.
1787 1799                   * In case of a read lock this is solved by getting a read
1788 1800                   * lock in each worker thread, which isn't possible in case
1789 1801                   * of a writer lock. So we fall back to the synchronous path
1790 1802                   * here.
1791 1803                   * In the future it might be possible to get some magic into
1792 1804                   * dsl_pool_config_held in a way that it returns true for
1793 1805                   * the worker threads so that a single lock held from this
1794 1806                   * thread suffices. For now, stay single threaded.
1795 1807                   */
1796 1808                  dmu_objset_find_dp_impl(dcp);
1797 1809  
1798 1810                  return (error);
1799 1811          }
1800 1812  
1801 1813          ntasks = dmu_find_threads;
1802 1814          if (ntasks == 0)
1803 1815                  ntasks = vdev_count_leaves(dp->dp_spa) * 4;
1804 1816          tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
1805 1817              INT_MAX, 0);
1806 1818          if (tq == NULL) {
1807 1819                  kmem_free(dcp, sizeof(*dcp));
1808 1820                  return (SET_ERROR(ENOMEM));
1809 1821          }
1810 1822          dcp->dc_tq = tq;
1811 1823  
1812 1824          /* dcp will be freed by task */
1813 1825          (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
1814 1826  
1815 1827          /*
1816 1828           * PORTING: this code relies on the property of taskq_wait to wait
1817 1829           * until no more tasks are queued and no more tasks are active. As
1818 1830           * we always queue new tasks from within other tasks, task_wait
1819 1831           * reliably waits for the full recursion to finish, even though we
1820 1832           * enqueue new tasks after taskq_wait has been called.
1821 1833           * On platforms other than illumos, taskq_wait may not have this
1822 1834           * property.
1823 1835           */
1824 1836          taskq_wait(tq);
1825 1837          taskq_destroy(tq);
1826 1838          mutex_destroy(&err_lock);
1827 1839  
1828 1840          return (error);
1829 1841  }
1830 1842  
1831 1843  /*
1832 1844   * Find all objsets under name, and for each, call 'func(child_name, arg)'.
1833 1845   * The dp_config_rwlock must not be held when this is called, and it
1834 1846   * will not be held when the callback is called.
1835 1847   * Therefore this function should only be used when the pool is not changing
1836 1848   * (e.g. in syncing context), or the callback can deal with the possible races.
1837 1849   */
1838 1850  static int
1839 1851  dmu_objset_find_impl(spa_t *spa, const char *name,
1840 1852      int func(const char *, void *), void *arg, int flags)
1841 1853  {
1842 1854          dsl_dir_t *dd;
1843 1855          dsl_pool_t *dp = spa_get_dsl(spa);
1844 1856          dsl_dataset_t *ds;
1845 1857          zap_cursor_t zc;
1846 1858          zap_attribute_t *attr;
1847 1859          char *child;
1848 1860          uint64_t thisobj;
1849 1861          int err;
1850 1862  
1851 1863          dsl_pool_config_enter(dp, FTAG);
1852 1864  
1853 1865          err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
1854 1866          if (err != 0) {
1855 1867                  dsl_pool_config_exit(dp, FTAG);
1856 1868                  return (err);
1857 1869          }
1858 1870  
1859 1871          /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
1860 1872          if (dd->dd_myname[0] == '$') {
1861 1873                  dsl_dir_rele(dd, FTAG);
1862 1874                  dsl_pool_config_exit(dp, FTAG);
1863 1875                  return (0);
1864 1876          }
1865 1877  
1866 1878          thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
1867 1879          attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
1868 1880  
1869 1881          /*
1870 1882           * Iterate over all children.
1871 1883           */
1872 1884          if (flags & DS_FIND_CHILDREN) {
1873 1885                  for (zap_cursor_init(&zc, dp->dp_meta_objset,
1874 1886                      dsl_dir_phys(dd)->dd_child_dir_zapobj);
1875 1887                      zap_cursor_retrieve(&zc, attr) == 0;
1876 1888                      (void) zap_cursor_advance(&zc)) {
1877 1889                          ASSERT3U(attr->za_integer_length, ==,
1878 1890                              sizeof (uint64_t));
1879 1891                          ASSERT3U(attr->za_num_integers, ==, 1);
1880 1892  
1881 1893                          child = kmem_asprintf("%s/%s", name, attr->za_name);
1882 1894                          dsl_pool_config_exit(dp, FTAG);
1883 1895                          err = dmu_objset_find_impl(spa, child,
1884 1896                              func, arg, flags);
1885 1897                          dsl_pool_config_enter(dp, FTAG);
1886 1898                          strfree(child);
1887 1899                          if (err != 0)
1888 1900                                  break;
1889 1901                  }
1890 1902                  zap_cursor_fini(&zc);
1891 1903  
1892 1904                  if (err != 0) {
1893 1905                          dsl_dir_rele(dd, FTAG);
1894 1906                          dsl_pool_config_exit(dp, FTAG);
1895 1907                          kmem_free(attr, sizeof (zap_attribute_t));
1896 1908                          return (err);
1897 1909                  }
1898 1910          }
1899 1911  
1900 1912          /*
1901 1913           * Iterate over all snapshots.
1902 1914           */
1903 1915          if (flags & DS_FIND_SNAPSHOTS) {
1904 1916                  err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1905 1917  
1906 1918                  if (err == 0) {
1907 1919                          uint64_t snapobj;
1908 1920  
1909 1921                          snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
1910 1922                          dsl_dataset_rele(ds, FTAG);
1911 1923  
1912 1924                          for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
1913 1925                              zap_cursor_retrieve(&zc, attr) == 0;
1914 1926                              (void) zap_cursor_advance(&zc)) {
1915 1927                                  ASSERT3U(attr->za_integer_length, ==,
1916 1928                                      sizeof (uint64_t));
1917 1929                                  ASSERT3U(attr->za_num_integers, ==, 1);
1918 1930  
1919 1931                                  child = kmem_asprintf("%s@%s",
1920 1932                                      name, attr->za_name);
1921 1933                                  dsl_pool_config_exit(dp, FTAG);
1922 1934                                  err = func(child, arg);
1923 1935                                  dsl_pool_config_enter(dp, FTAG);
1924 1936                                  strfree(child);
1925 1937                                  if (err != 0)
1926 1938                                          break;
1927 1939                          }
1928 1940                          zap_cursor_fini(&zc);
1929 1941                  }
1930 1942          }
1931 1943  
1932 1944          dsl_dir_rele(dd, FTAG);
1933 1945          kmem_free(attr, sizeof (zap_attribute_t));
1934 1946          dsl_pool_config_exit(dp, FTAG);
1935 1947  
1936 1948          if (err != 0)
1937 1949                  return (err);
1938 1950  
1939 1951          /* Apply to self. */
1940 1952          return (func(name, arg));
1941 1953  }
1942 1954  
1943 1955  /*
1944 1956   * See comment above dmu_objset_find_impl().
1945 1957   */
1946 1958  int
1947 1959  dmu_objset_find(char *name, int func(const char *, void *), void *arg,
1948 1960      int flags)
1949 1961  {
1950 1962          spa_t *spa;
1951 1963          int error;
1952 1964  
1953 1965          error = spa_open(name, &spa, FTAG);
1954 1966          if (error != 0)
1955 1967                  return (error);
1956 1968          error = dmu_objset_find_impl(spa, name, func, arg, flags);
1957 1969          spa_close(spa, FTAG);
1958 1970          return (error);
1959 1971  }
1960 1972  
1961 1973  void
1962 1974  dmu_objset_set_user(objset_t *os, void *user_ptr)
1963 1975  {
1964 1976          ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
1965 1977          os->os_user_ptr = user_ptr;
1966 1978  }
1967 1979  
1968 1980  void *
1969 1981  dmu_objset_get_user(objset_t *os)
1970 1982  {
1971 1983          ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
1972 1984          return (os->os_user_ptr);
1973 1985  }
1974 1986  
1975 1987  /*
1976 1988   * Determine name of filesystem, given name of snapshot.
1977 1989   * buf must be at least MAXNAMELEN bytes
1978 1990   */
1979 1991  int
1980 1992  dmu_fsname(const char *snapname, char *buf)
1981 1993  {
1982 1994          char *atp = strchr(snapname, '@');
1983 1995          if (atp == NULL)
1984 1996                  return (SET_ERROR(EINVAL));
1985 1997          if (atp - snapname >= MAXNAMELEN)
1986 1998                  return (SET_ERROR(ENAMETOOLONG));
1987 1999          (void) strlcpy(buf, snapname, atp - snapname + 1);
1988 2000          return (0);
1989 2001  }
  
    | ↓ open down ↓ | 230 lines elided | ↑ open up ↑ | 
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX