1 # CDDL HEADER START
   2 #
   3 # The contents of this file are subject to the terms of the
   4 # Common Development and Distribution License (the "License").
   5 # You may not use this file except in compliance with the License.
   6 #
   7 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   8 # or http://www.opensolaris.org/os/licensing.
   9 # See the License for the specific language governing permissions
  10 # and limitations under the License.
  11 #
  12 # When distributing Covered Code, include this CDDL HEADER in each
  13 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  14 # If applicable, add the following below this CDDL HEADER, with the
  15 # fields enclosed by brackets "[]" replaced with your own identifying
  16 # information: Portions Copyright [yyyy] [name of copyright owner]
  17 #
  18 # CDDL HEADER END
  19 #
  20 # Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  21 # Use is subject to license terms.
  22 #
  23 # $Id: CACHE_SPEC,v 3.6.0.0 1998/01/05 22:55:19 idumois Exp $
  24 #
  25 
  26         "sd" cache layer
  27         ----------------
  28 #include <sys/sd/sd.h>
  29 
  30 The "sd" layer provides a common interface to the functionality
  31 described below.  It will also allow switching to a direct to disk
  32 version, so that a new cache module could be loaded.
  33 The functions are basically the same as those below,
  34 but named without the leading underscore.
  35 (ie sd_alloc_buf instead of _sd_alloc_buf)
  36 
  37 
  38         "sdbc" -- storage device block cache (aka blkc)
  39         -----------------------------------------------
  40 
  41 #include "uts/sd/sdbc/sd_cache.h"       /* for SDBC interface */
  42 #include "sys/sd/sd.h"                  /* for generic SD interface */
  43 
  44 (all interaction is in terms of the buf_handle.)
  45 
  46 Currently buf_handle is declared as:
  47 
  48 #define _SD_MAX_BLKS    64
  49 #define _SD_MAX_FBAS    (_SD_MAX_BLKS << FBA_SHFT)
  50 
  51 typedef struct _sd_buf_handle {
  52         int bh_cd;              /* actually bh_buf.sb_cd */
  53         int bh_fba_pos;         /* bh_buf.sb_pos */
  54         int bh_fba_len;         /* bh_buf.sb_len */
  55         int bh_flag;            /* bh_buf.sb_flag */
  56         int bh_error;           /* bh_buf.sb_error */
  57         _sd_vec_t bh_bufvec[_SD_MAX_BLKS]; /* bh_buf.sb_vec */
  58         void (*bh_disconnect_cb)();
  59         void (*bh_read_cb)();
  60         void (*bh_write_cb)();
  61         ......
  62 } _sd_buf_handle_t;
  63 
  64 
  65 typedef struct sd_vec_s {               /* Scatter gather element */
  66         unsigned char   *sv_addr;       /* Virtual address of data */
  67         unsigned int    sv_vme;         /* VME address of data */
  68         int             sv_len;         /* Data length in bytes */
  69 } sd_vec_t;
  70 
  71 The upper level routines should reference only handle->bh_error and
  72 handle->bh_bufvec.  The bh_bufvec is an array of _sd_vec_t with the
  73 last item in the array having a NULL address (sv_addr).
  74 
  75 IMPORTANT: The handle should be treated read-only and never be modified.
  76 
  77         1) Multiple accesses to a single file will be supported. 
  78         (Side effect: If a process owning cache blocks of a file attempts
  79         to allocate overlapping cache blocks, it will be a 
  80         deadlock condition.)
  81 
  82         2) Multiple writes to an allocated block will be supported. It
  83         is no longer necessary to free and re-allocate between writes.
  84 
  85         3) _SD_NOBLOCK is the equivalent of async_io -- the io will be initiated
  86         if required with the call returning _SD_PENDING. A callback 
  87         (read or write) will be called at io end action.
  88 
  89         4) Disconnect hints to ckd will be provided by the use of
  90         either psema or thread_bind() when io needs to be initiated.
  91 
  92 
  93 NOTE:
  94         fba_pos = disk block number, each block being 512 bytes.
  95         fba_len = len in disk blocks, each block being 512 bytes.
  96                 Thus, 512 bytes = 1 fba_len, 1024 = 2 fba_len etc...
  97 
  98 Hints:
  99         _SD_WRTHRU: write through mode. 
 100                 This hint can be set on a node, a device or per access.
 101         _SD_FORCED_WRTHRU: forced write through (node down or flow control)
 102                 If this hint is cleared,  when only one node is up, 
 103                 _sd_uncommit() will not work properly, and a second
 104                 failure could result in lost data.
 105                 This is a node hint.
 106         _SD_NOCACHE: reuse cache blocks immediately instead of keeping
 107                 in lru order.
 108                 This hint can be set on a device or per access.
 109 
 110 Interface:
 111 
 112 _sd_buf_handle_t *
 113 _sd_alloc_handle(discon_cb, read_cb, write_cb)
 114         void (*discon_cb)();
 115         void (*read_cb)();
 116         void (*write_cb)();
 117 
 118         The callbacks can be NULL if you do not want any callbacks.
 119         Else, the callbacks will be stored in the handle, and will be 
 120         called at specific points in the cache. (It's up to the
 121         callback to do what is necessary, including disconnecting
 122         from the channel)
 123 
 124         Usage: for better performance, an application could allocate
 125         a handle (or as many handles as is required)  upfront and
 126         use it later on in the cache calls.
 127 
 128         Not allocating and managing the handles would mean a new
 129         handle will be allocated and freed during _sd_alloc_buf
 130         and _sd_free_buf.
 131         
 132 int     
 133 _sd_free_handle(handle)
 134         _sd_buf_handle_t *handle;
 135 
 136         Only handles that are allocated through _sd_alloc_handle
 137         should be freed with this call.
 138         
 139 int 
 140 _sd_alloc_buf (cd, fba_pos, fba_len, flag, handle_p)
 141         int cd;
 142         int fba_pos;
 143         int fba_len;
 144         int flag;
 145         _sd_buf_handle_t **handle_p;  
 146  
 147         cd = cache descriptor. Results in an error if this node does
 148                 not own this disk and the other node has not crashed.
 149                 (ie. requests must be routed to the correct node)
 150                 (see fault tolerant aspects discussed elsewhere)
 151  
 152         fba_pos = disk position in multiples of 512 byte blocks.
 153         fba_len = length in multiples of 512 byte blocks.
 154                 (NOTE: This cannot exceed _SD_MAX_FBAS)
 155  
 156         flag = None, one or more of the following (described below):
 157                 _SD_RDBUF | _SD_WRBUF | _SD_RDWRBUF | _SD_PINNABLE |
 158                 _SD_NOBLOCK | _SD_NOCACHE | _SD_WRTHRU
 159  
 160         handle_p = (*handle_p = handle to be used for this call)
 161                 If *handle_p == NULL, a new handle will be
 162                 allocated. _sd_free_buf will free up any handles
 163                     allocated in this fashion.
 164                 NOTE: Handles allocated in this fashion will not have
 165                         any callbacks registered in them. As such, 
 166                         _SD_NOBLOCK flag along with a NULL handle would
 167                         result in the io being lost.
 168         
 169         return: Error number if > 0
 170                 possible errors:
 171                         EINVAL if arguments are incorrect or
 172                                 cache not initialized or
 173                                 device not open.
 174                         E2BIG if this request is a read and such a large
 175                         request cannot be currently satisfied. (break up
 176                         the io or re-issue at a later point)
 177                         EIO or any other errno that the driver might return.
 178                 Note: on error, the handle is not active,
 179                 and also is freed if *handle_p was NULL.
 180  
 181         if 0 or less, status will be one of:
 182            _SD_DONE: buffer is ready, and ready to be used.
 183                 (with the blocks valid if _SD_RDBUF is set)
 184            _SD_PENDING: 
 185                 read callback, if one has been registered in the handle, 
 186                 will be called to complete this request.
 187            _SD_HIT:  Same as _SD_DONE, read was satisfied by cache,
 188                 or no blocking required for write buffer.
 189 
 190         Note:   _SD_RDBUF will issue the read if necessary.
 191                 _SD_WRBUF allocates a network address to reflect to
 192                         mirror node on _sd_write().
 193                 ~_SD_RDBUF allocates buffers but does NOT pre-read;
 194                         use _sd_read() to fill in (portions) as req'd.
 195 
 196         Note:   flag == (_SD_RDBUF|_SD_WRTHRU|_SD_NOCACHE) will
 197                 clear valid bits (that are not dirty) thus read direct
 198                 from disk, without requiring a hash invalidate.
 199 
 200 
 201 int
 202 _sd_write (handle, fba_pos, fba_len, flag)
 203         _sd_buf_handle_t *handle;
 204         int fba_pos, fba_len;
 205         int flag;
 206
 207         handle = handle previously allocated in allocate buf.
 208           fba_pos and fba_len have to be within the allocated portion.
 209         flag: _SD_NOBLOCK | _SD_WRTHRU
 210 
 211         Attempting to write to a handle that was not allocated for write 
 212         will return error (EINVAL)      
 213 
 214         returns:  errno if return > 0
 215         if 0 or less, return  will be one of:
 216            _SD_PENDING: will be returned only if _SD_NOBLOCK is set AND
 217                 either the flag is _SD_WRTHRU or the other node is down,
 218                 or the device/node is in write through mode
 219            _SD_DONE: is returned if the block has been written to the disk.
 220            _SD_HIT: write block in cache.
 221                 
 222 int
 223 _sd_read (handle, fba_pos, fba_len, flag)
 224         _sd_buf_handle_t *handle;
 225         int fba_pos, fba_len;
 226         int flag;
 227 
 228         handle = handle previously allocated in allocate buf.
 229           fba_pos and fba_len have to be within the allocated portion.
 230         flag: _SD_NOBLOCK
 231 
 232         returns:  errno if return > 0
 233                 error E2BIG if this request is big and cannot be currently
 234                  satisfied. (break up the io or re-issue at a later point)
 235 
 236         if 0 or less, return  will be one of:
 237            _SD_PENDING: will be returned only if _SD_NOBLOCK is set and 
 238                 we need to do an io.
 239            _SD_HIT: is returned if the blocks were satisfied by cache.
 240            _SD_DONE: some blocks were read from disk.
 241 
 242 int
 243 _sd_uncommit(handle, fba_pos, fba_len, flag)
 244         _sd_buf_handle_t *handle;
 245         int fba_pos, fba_len;
 246         int flag;
 247 
 248         handle = handle previously allocated in allocate buf.
 249           fba_pos and fba_len have to be within the allocated portion.
 250         flag: reserved for future use.
 251 
 252         _sd_uncommit could block and cannot be called from a 
 253                 "non-blocking" context.
 254         (This is under review, from the ckd point of view)
 255 
 256         returns 0 (_SD_DONE) else errno;
 257 
 258 
 259 int
 260 _sd_zero (handle, fba_pos, fba_len, flag)
 261         _sd_buf_handle_t *handle;
 262         int fba_pos, fba_len;
 263         int flag;
 264 
 265         handle = handle previously allocated in allocate buf.
 266           fba_pos and fba_len have to be within the allocated portion.
 267         zero the buffer described by the handle.
 268         flag: _SD_NOBLOCK | _SD_WRTHRU
 269 
 270         The call commits data to disk. 
 271         This call has characteristics similar to _sd_write.
 272 
 273         returns: errno if return > 0
 274                 if 0 or less, return will be one of:
 275                 _SD_DONE
 276                 _SD_PENDING 
 277 
 278 _sd_copy (handle1, handle2, fba_pos1, fba_pos2, fba_len)
 279         _sd_buf_handle_t *handle1, *handle2;
 280         int fba_pos1, fba_pos2, fba_len;
 281         
 282         Copies relevant data from handle1 to handle2.
 283         Useful for mirroring, remote dual copy, backup while open,
 284         in-house tests, etc.
 285 
 286         This call does not commit data to disk - you must explicitly
 287         call _sd_write() on handle2 if that is what you want.
 288 
 289         returns: errno if return > 0:
 290                          EIO - if sd module should do a generic bcopy
 291                          others - real error (passed to user)
 292                  if 0 or less, return will be:
 293                         _SD_DONE - success
 294 
 295 _sd_free_buf(handle)
 296         _sd_buf_handle_t *handle;
 297 
 298         handle = handle previously allocated in allocate buf.
 299 
 300         returns 0 (_SD_DONE) else errno;
 301 
 302 _sd_open(filename, flag)
 303         char *filename;
 304         int flag;
 305 
 306         returns a cache descriptor, or negative error number.
 307         Typically use _sd_attach_cd(cd) before accessing the device.
 308         Note: if the device is already open, it returns the same cache descriptor.
 309         Currently there is no reference count; so one _sd_close() closes
 310         the cache descriptor (in all contexts).
 311 
 312 _sd_close(cd)
 313         int cd;
 314         Similar to _sd_detach_cd below.
 315         Note: intended to be called when terminating the cache; and not during
 316         normal operation.  No reference count (see above).
 317         Returns: 0 success, EIO.
 318 
 319 _sd_detach_cd(cd)
 320         re-reflect any pinned blocks to the other side,
 321         or wait for writes to flush; and invalidate that device's hash entries,
 322         and relinquish device responsibility.
 323         Returns: 0 success, EIO, EAGAIN.
 324 
 325 _sd_attach_cd(cd)
 326         If device has pinned blocks then scan for and re-pin those blocks
 327         (same idea as "node recovery" process, but per-device);
 328         and assert device responsibility.
 329 
 330 _sd_notify_all_pin(cd)
 331         rescan list of failed blocks and re-issue the pinned callback to
 332         simulation. 
 333 
 334 
 335 _sd_register_pinned(func)
 336         void (*func)();
 337     callback (*func)(cd, fba_pos, fba_len) when disk write fails,
 338     and _SD_PINNABLE was specified on alloc.
 339 
 340 _sd_register_unpinned(func)
 341         void (*func)();
 342     callback (*func)(cd, fba_pos, fba_len) when data previously pinned
 343     is successfully written to disk.
 344 
 345 _sd_register_down(func)
 346         void (*func)();
 347     callback (*func)() when health monitor detects the other node went down.
 348 
 349 _sd_set_hint(cd, hint) 
 350 _sd_clear_hint(cd, hint)
 351 _sd_get_cd_hint(cd, &hint)
 352 _sd_set_node_hint(hint)
 353 _sd_clear_node_hint(hint)
 354 _sd_get_node_hint(&hint)
 355 
 356     where hint is _SD_NOCACHE or _SD_WRTHRU. (Write through being synchronous
 357         write and will be the default if the second node dies.)
 358 
 359    _SD_NOCACHE: hint indicating that the current access need not be 
 360         cached for later consumption.
 361 
 362 
 363 _sd_discard_pinned(cd, fba_pos, fba_len)
 364         call from ckd into cache, called when data that was pinned
 365         earlier can be discarded from the cache.
 366 
 367         returns: 0 or error.
 368         (error = EINVAL if the discard could not be done)
 369 
 370 (note: there is an inherent race between the unpinned callback and 
 371 _sd_discard_pinned which could put the data on disk in an inconsistent
 372 state)
 373 
 374 
 375 Failover support:
 376 
 377 The Nodedown callback will be called, if one has been registered. This 
 378 will happen as soon as the other node has been detected to have gone down, 
 379 or when the cache is disabled on the other node.
 380 
 381 The amount of time for this callback to happen after the node goes down
 382 is not deterministic.
 383 
 384 Access to a mirror node's devices is only valid from the point the 
 385 nodedown callback is called till the other node is determined to be back
 386 in operation.
 387 
 388 Access to mirror node's devices while recovery is in progress will 
 389 block the access till the recovery is complete.