Print this page
    
4334 Improve ZFS N-way mirror read performance
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
          +++ new/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  
    | 
      ↓ open down ↓ | 
    13 lines elided | 
    
      ↑ open up ↑ | 
  
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2013 by Delphix. All rights reserved.
       24 + * Copyright (c) 2013 Steven Hartland. All rights reserved.
  24   25   */
  25   26  
  26   27  #ifndef _SYS_VDEV_IMPL_H
  27   28  #define _SYS_VDEV_IMPL_H
  28   29  
  29   30  #include <sys/avl.h>
  30   31  #include <sys/dmu.h>
  31   32  #include <sys/metaslab.h>
  32   33  #include <sys/nvpair.h>
  33   34  #include <sys/space_map.h>
  34   35  #include <sys/vdev.h>
  35   36  #include <sys/dkio.h>
  36   37  #include <sys/uberblock_impl.h>
  37   38  
  38   39  #ifdef  __cplusplus
  39   40  extern "C" {
  40   41  #endif
  41   42  
  42   43  /*
  43   44   * Virtual device descriptors.
  44   45   *
  45   46   * All storage pool operations go through the virtual device framework,
  46   47   * which provides data replication and I/O scheduling.
  47   48   */
  48   49  
  49   50  /*
  50   51   * Forward declarations that lots of things need.
  51   52   */
  52   53  typedef struct vdev_queue vdev_queue_t;
  53   54  typedef struct vdev_cache vdev_cache_t;
  54   55  typedef struct vdev_cache_entry vdev_cache_entry_t;
  55   56  
  56   57  /*
  57   58   * Virtual device operations
  58   59   */
  59   60  typedef int     vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
  60   61      uint64_t *ashift);
  61   62  typedef void    vdev_close_func_t(vdev_t *vd);
  62   63  typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
  63   64  typedef int     vdev_io_start_func_t(zio_t *zio);
  64   65  typedef void    vdev_io_done_func_t(zio_t *zio);
  65   66  typedef void    vdev_state_change_func_t(vdev_t *vd, int, int);
  66   67  typedef void    vdev_hold_func_t(vdev_t *vd);
  67   68  typedef void    vdev_rele_func_t(vdev_t *vd);
  68   69  
  69   70  typedef struct vdev_ops {
  70   71          vdev_open_func_t                *vdev_op_open;
  71   72          vdev_close_func_t               *vdev_op_close;
  72   73          vdev_asize_func_t               *vdev_op_asize;
  73   74          vdev_io_start_func_t            *vdev_op_io_start;
  74   75          vdev_io_done_func_t             *vdev_op_io_done;
  75   76          vdev_state_change_func_t        *vdev_op_state_change;
  76   77          vdev_hold_func_t                *vdev_op_hold;
  77   78          vdev_rele_func_t                *vdev_op_rele;
  78   79          char                            vdev_op_type[16];
  79   80          boolean_t                       vdev_op_leaf;
  80   81  } vdev_ops_t;
  81   82  
  82   83  /*
  83   84   * Virtual device properties
  84   85   */
  85   86  struct vdev_cache_entry {
  86   87          char            *ve_data;
  87   88          uint64_t        ve_offset;
  88   89          uint64_t        ve_lastused;
  89   90          avl_node_t      ve_offset_node;
  90   91          avl_node_t      ve_lastused_node;
  91   92          uint32_t        ve_hits;
  92   93          uint16_t        ve_missed_update;
  93   94          zio_t           *ve_fill_io;
  94   95  };
  95   96  
  96   97  struct vdev_cache {
  97   98          avl_tree_t      vc_offset_tree;
  98   99          avl_tree_t      vc_lastused_tree;
  99  100          kmutex_t        vc_lock;
 100  101  };
 101  102  
 102  103  typedef struct vdev_queue_class {
 103  104          uint32_t        vqc_active;
 104  105  
 105  106          /*
 106  107           * Sorted by offset or timestamp, depending on whether the queue
 107  108           * is LBA-ordered or FIFO.
 108  109           */
  
    | 
      ↓ open down ↓ | 
    75 lines elided | 
    
      ↑ open up ↑ | 
  
 109  110          avl_tree_t      vqc_queued_tree;
 110  111  } vdev_queue_class_t;
 111  112  
 112  113  struct vdev_queue {
 113  114          vdev_t          *vq_vdev;
 114  115          vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
 115  116          avl_tree_t      vq_active_tree;
 116  117          uint64_t        vq_last_offset;
 117  118          hrtime_t        vq_io_complete_ts; /* time last i/o completed */
 118  119          kmutex_t        vq_lock;
      120 +        uint64_t        vq_last_queued_offset;
 119  121  };
 120  122  
 121  123  /*
 122  124   * Virtual device descriptor
 123  125   */
 124  126  struct vdev {
 125  127          /*
 126  128           * Common to all vdev types.
 127  129           */
 128  130          uint64_t        vdev_id;        /* child number in vdev parent  */
 129  131          uint64_t        vdev_guid;      /* unique ID for this vdev      */
 130  132          uint64_t        vdev_guid_sum;  /* self guid + all child guids  */
 131  133          uint64_t        vdev_orig_guid; /* orig. guid prior to remove   */
 132  134          uint64_t        vdev_asize;     /* allocatable device capacity  */
 133  135          uint64_t        vdev_min_asize; /* min acceptable asize         */
 134  136          uint64_t        vdev_max_asize; /* max acceptable asize         */
 135  137          uint64_t        vdev_ashift;    /* block alignment shift        */
 136  138          uint64_t        vdev_state;     /* see VDEV_STATE_* #defines    */
 137  139          uint64_t        vdev_prevstate; /* used when reopening a vdev   */
 138  140          vdev_ops_t      *vdev_ops;      /* vdev operations              */
 139  141          spa_t           *vdev_spa;      /* spa for this vdev            */
 140  142          void            *vdev_tsd;      /* type-specific data           */
 141  143          vnode_t         *vdev_name_vp;  /* vnode for pathname           */
 142  144          vnode_t         *vdev_devid_vp; /* vnode for devid              */
 143  145          vdev_t          *vdev_top;      /* top-level vdev               */
 144  146          vdev_t          *vdev_parent;   /* parent vdev                  */
 145  147          vdev_t          **vdev_child;   /* array of children            */
 146  148          uint64_t        vdev_children;  /* number of children           */
 147  149          vdev_stat_t     vdev_stat;      /* virtual device statistics    */
 148  150          boolean_t       vdev_expanding; /* expand the vdev?             */
 149  151          boolean_t       vdev_reopening; /* reopen in progress?          */
 150  152          int             vdev_open_error; /* error on last open          */
 151  153          kthread_t       *vdev_open_thread; /* thread opening children   */
 152  154          uint64_t        vdev_crtxg;     /* txg when top-level was added */
 153  155  
 154  156          /*
 155  157           * Top-level vdev state.
 156  158           */
 157  159          uint64_t        vdev_ms_array;  /* metaslab array object        */
 158  160          uint64_t        vdev_ms_shift;  /* metaslab size shift          */
 159  161          uint64_t        vdev_ms_count;  /* number of metaslabs          */
 160  162          metaslab_group_t *vdev_mg;      /* metaslab group               */
 161  163          metaslab_t      **vdev_ms;      /* metaslab array               */
 162  164          txg_list_t      vdev_ms_list;   /* per-txg dirty metaslab lists */
 163  165          txg_list_t      vdev_dtl_list;  /* per-txg dirty DTL lists      */
 164  166          txg_node_t      vdev_txg_node;  /* per-txg dirty vdev linkage   */
 165  167          boolean_t       vdev_remove_wanted; /* async remove wanted?     */
 166  168          boolean_t       vdev_probe_wanted; /* async probe wanted?       */
 167  169          list_node_t     vdev_config_dirty_node; /* config dirty list    */
 168  170          list_node_t     vdev_state_dirty_node; /* state dirty list      */
 169  171          uint64_t        vdev_deflate_ratio; /* deflation ratio (x512)   */
 170  172          uint64_t        vdev_islog;     /* is an intent log device      */
 171  173          uint64_t        vdev_removing;  /* device is being removed?     */
 172  174          boolean_t       vdev_ishole;    /* is a hole in the namespace   */
 173  175  
 174  176          /*
 175  177           * Leaf vdev state.
 176  178           */
 177  179          range_tree_t    *vdev_dtl[DTL_TYPES]; /* dirty time logs        */
 178  180          space_map_t     *vdev_dtl_sm;   /* dirty time log space map     */
 179  181          txg_node_t      vdev_dtl_node;  /* per-txg dirty DTL linkage    */
 180  182          uint64_t        vdev_dtl_object; /* DTL object                  */
 181  183          uint64_t        vdev_psize;     /* physical device capacity     */
 182  184          uint64_t        vdev_wholedisk; /* true if this is a whole disk */
 183  185          uint64_t        vdev_offline;   /* persistent offline state     */
 184  186          uint64_t        vdev_faulted;   /* persistent faulted state     */
 185  187          uint64_t        vdev_degraded;  /* persistent degraded state    */
 186  188          uint64_t        vdev_removed;   /* persistent removed state     */
 187  189          uint64_t        vdev_resilver_txg; /* persistent resilvering state */
 188  190          uint64_t        vdev_nparity;   /* number of parity devices for raidz */
 189  191          char            *vdev_path;     /* vdev path (if any)           */
 190  192          char            *vdev_devid;    /* vdev devid (if any)          */
 191  193          char            *vdev_physpath; /* vdev device path (if any)    */
 192  194          char            *vdev_fru;      /* physical FRU location        */
 193  195          uint64_t        vdev_not_present; /* not present during import  */
 194  196          uint64_t        vdev_unspare;   /* unspare when resilvering done */
 195  197          boolean_t       vdev_nowritecache; /* true if flushwritecache failed */
 196  198          boolean_t       vdev_checkremove; /* temporary online test      */
 197  199          boolean_t       vdev_forcefault; /* force online fault          */
 198  200          boolean_t       vdev_splitting; /* split or repair in progress  */
 199  201          boolean_t       vdev_delayed_close; /* delayed device close?    */
 200  202          boolean_t       vdev_tmpoffline; /* device taken offline temporarily? */
  
    | 
      ↓ open down ↓ | 
    72 lines elided | 
    
      ↑ open up ↑ | 
  
 201  203          boolean_t       vdev_detached;  /* device detached?             */
 202  204          boolean_t       vdev_cant_read; /* vdev is failing all reads    */
 203  205          boolean_t       vdev_cant_write; /* vdev is failing all writes  */
 204  206          boolean_t       vdev_isspare;   /* was a hot spare              */
 205  207          boolean_t       vdev_isl2cache; /* was an l2cache device        */
 206  208          vdev_queue_t    vdev_queue;     /* I/O deadline schedule queue  */
 207  209          vdev_cache_t    vdev_cache;     /* physical block cache         */
 208  210          spa_aux_vdev_t  *vdev_aux;      /* for l2cache vdevs            */
 209  211          zio_t           *vdev_probe_zio; /* root of current probe       */
 210  212          vdev_aux_t      vdev_label_aux; /* on-disk aux state            */
      213 +        uint16_t        vdev_rotation_rate; /* rotational rate of the media */
      214 +#define VDEV_RATE_UNKNOWN       0
      215 +#define VDEV_RATE_NON_ROTATING  1
 211  216  
 212  217          /*
 213  218           * For DTrace to work in userland (libzpool) context, these fields must
 214  219           * remain at the end of the structure.  DTrace will use the kernel's
 215  220           * CTF definition for 'struct vdev', and since the size of a kmutex_t is
 216  221           * larger in userland, the offsets for the rest of the fields would be
 217  222           * incorrect.
 218  223           */
 219  224          kmutex_t        vdev_dtl_lock;  /* vdev_dtl_{map,resilver}      */
 220  225          kmutex_t        vdev_stat_lock; /* vdev_stat                    */
 221  226          kmutex_t        vdev_probe_lock; /* protects vdev_probe_zio     */
 222  227  };
 223  228  
 224  229  #define VDEV_RAIDZ_MAXPARITY    3
 225  230  
 226  231  #define VDEV_PAD_SIZE           (8 << 10)
 227  232  /* Total bytes to skip for the 2 padding areas (vl_pad1 and vl_pad2) */
 228  233  #define VDEV_SKIP_SIZE          (VDEV_PAD_SIZE * 2)
 229  234  #define VDEV_PHYS_SIZE          (112 << 10)
 230  235  #define VDEV_UBERBLOCK_RING     (128 << 10)
 231  236  
 232  237  #define VDEV_UBERBLOCK_SHIFT(vd)        \
 233  238          MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
 234  239  #define VDEV_UBERBLOCK_COUNT(vd)        \
 235  240          (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
 236  241  #define VDEV_UBERBLOCK_OFFSET(vd, n)    \
 237  242          offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
 238  243  #define VDEV_UBERBLOCK_SIZE(vd)         (1ULL << VDEV_UBERBLOCK_SHIFT(vd))
 239  244  
 240  245  typedef struct vdev_phys {
 241  246          char            vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
 242  247          zio_eck_t       vp_zbt;
 243  248  } vdev_phys_t;
 244  249  
 245  250  typedef struct vdev_label {
 246  251          char            vl_pad1[VDEV_PAD_SIZE];                 /*  8K */
 247  252          char            vl_pad2[VDEV_PAD_SIZE];                 /*  8K */
 248  253          vdev_phys_t     vl_vdev_phys;                           /* 112K */
 249  254          char            vl_uberblock[VDEV_UBERBLOCK_RING];      /* 128K */
 250  255  } vdev_label_t;                                                 /* 256K total */
 251  256  
 252  257  /*
 253  258   * vdev_dirty() flags
 254  259   */
 255  260  #define VDD_METASLAB    0x01
 256  261  #define VDD_DTL         0x02
 257  262  
 258  263  /* Offset of embedded boot loader region on each label */
 259  264  #define VDEV_BOOT_OFFSET        (2 * sizeof (vdev_label_t))
 260  265  /*
 261  266   * Size of embedded boot loader region on each label.
 262  267   * The total size of the first two labels plus the boot area is 4MB.
 263  268   */
 264  269  #define VDEV_BOOT_SIZE          (7ULL << 19)                    /* 3.5M */
 265  270  
 266  271  /*
 267  272   * Size of label regions at the start and end of each leaf device.
 268  273   */
 269  274  #define VDEV_LABEL_START_SIZE   (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE)
 270  275  #define VDEV_LABEL_END_SIZE     (2 * sizeof (vdev_label_t))
 271  276  #define VDEV_LABELS             4
 272  277  #define VDEV_BEST_LABEL         VDEV_LABELS
 273  278  
 274  279  #define VDEV_ALLOC_LOAD         0
 275  280  #define VDEV_ALLOC_ADD          1
 276  281  #define VDEV_ALLOC_SPARE        2
 277  282  #define VDEV_ALLOC_L2CACHE      3
 278  283  #define VDEV_ALLOC_ROOTPOOL     4
 279  284  #define VDEV_ALLOC_SPLIT        5
 280  285  #define VDEV_ALLOC_ATTACH       6
 281  286  
 282  287  /*
 283  288   * Allocate or free a vdev
 284  289   */
 285  290  extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid,
 286  291      vdev_ops_t *ops);
 287  292  extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config,
 288  293      vdev_t *parent, uint_t id, int alloctype);
 289  294  extern void vdev_free(vdev_t *vd);
 290  295  
 291  296  /*
 292  297   * Add or remove children and parents
 293  298   */
 294  299  extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd);
 295  300  extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd);
 296  301  extern void vdev_compact_children(vdev_t *pvd);
 297  302  extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops);
 298  303  extern void vdev_remove_parent(vdev_t *cvd);
 299  304  
 300  305  /*
 301  306   * vdev load, sync, and dirty-marking routines
 302  307   */
 303  308  extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
 304  309  extern boolean_t vdev_log_state_valid(vdev_t *vd);
 305  310  extern void vdev_load(vdev_t *vd);
 306  311  extern int vdev_dtl_load(vdev_t *vd);
 307  312  extern void vdev_sync(vdev_t *vd, uint64_t txg);
 308  313  extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
 309  314  extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
 310  315  extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
 311  316  
 312  317  /*
 313  318   * Available vdev types.
 314  319   */
 315  320  extern vdev_ops_t vdev_root_ops;
 316  321  extern vdev_ops_t vdev_mirror_ops;
 317  322  extern vdev_ops_t vdev_replacing_ops;
 318  323  extern vdev_ops_t vdev_raidz_ops;
 319  324  extern vdev_ops_t vdev_disk_ops;
 320  325  extern vdev_ops_t vdev_file_ops;
 321  326  extern vdev_ops_t vdev_missing_ops;
 322  327  extern vdev_ops_t vdev_hole_ops;
 323  328  extern vdev_ops_t vdev_spare_ops;
 324  329  
 325  330  /*
 326  331   * Common size functions
 327  332   */
 328  333  extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
 329  334  extern uint64_t vdev_get_min_asize(vdev_t *vd);
 330  335  extern void vdev_set_min_asize(vdev_t *vd);
 331  336  
 332  337  /*
 333  338   * Global variables
 334  339   */
 335  340  /* zdb uses this tunable, so it must be declared here to make lint happy. */
 336  341  extern int zfs_vdev_cache_size;
 337  342  
 338  343  /*
 339  344   * The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
 340  345   */
 341  346  typedef struct vdev_buf {
 342  347          buf_t   vb_buf;         /* buffer that describes the io */
 343  348          zio_t   *vb_io;         /* pointer back to the original zio_t */
 344  349  } vdev_buf_t;
 345  350  
 346  351  #ifdef  __cplusplus
 347  352  }
 348  353  #endif
 349  354  
 350  355  #endif  /* _SYS_VDEV_IMPL_H */
  
    | 
      ↓ open down ↓ | 
    130 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX