1 # CDDL HEADER START 2 # 3 # The contents of this file are subject to the terms of the 4 # Common Development and Distribution License (the "License"). 5 # You may not use this file except in compliance with the License. 6 # 7 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 8 # or http://www.opensolaris.org/os/licensing. 9 # See the License for the specific language governing permissions 10 # and limitations under the License. 11 # 12 # When distributing Covered Code, include this CDDL HEADER in each 13 # file and include the License file at usr/src/OPENSOLARIS.LICENSE. 14 # If applicable, add the following below this CDDL HEADER, with the 15 # fields enclosed by brackets "[]" replaced with your own identifying 16 # information: Portions Copyright [yyyy] [name of copyright owner] 17 # 18 # CDDL HEADER END 19 # 20 # Copyright 2008 Sun Microsystems, Inc. All rights reserved. 21 # Use is subject to license terms. 22 # 23 # $Id: CACHE_SPEC,v 3.6.0.0 1998/01/05 22:55:19 idumois Exp $ 24 # 25 26 "sd" cache layer 27 ---------------- 28 #include <sys/sd/sd.h> 29 30 The "sd" layer provides a common interface to the functionality 31 described below. It will also allow switching to a direct to disk 32 version, so that a new cache module could be loaded. 33 The functions are basically the same as those below, 34 but named without the leading underscore. 35 (ie sd_alloc_buf instead of _sd_alloc_buf) 36 37 38 "sdbc" -- storage device block cache (aka blkc) 39 ----------------------------------------------- 40 41 #include "uts/sd/sdbc/sd_cache.h" /* for SDBC interface */ 42 #include "sys/sd/sd.h" /* for generic SD interface */ 43 44 (all interaction is in terms of the buf_handle. 
45 46 Currently buf_handle is declared as: 47 48 #define _SD_MAX_BLKS 64 49 #define _SD_MAX_FBAS (_SD_MAX_BLKS << FBA_SHFT) 50 51 typedef struct _sd_buf_handle { 52 int bh_cd; /* actually bh_buf.sb_cd */ 53 int bh_fba_pos; /* bh_buf.sb_pos */ 54 int bh_fba_len; /* bh_buf.sb_len */ 55 int bh_flag; /* bh_buf.sb_flag */ 56 int bh_error; /* bh_buf.sb_error */ 57 _sd_vec_t bh_bufvec[_SD_MAX_BLKS]; /* bh_buf.sb_vec */ 58 void (*bh_disconnect_cb)(); 59 void (*bh_read_cb)(); 60 void (*bh_write_cb)(); 61 ...... 62 } _sd_buf_handle_t; 63 64 65 typedef struct sd_vec_s { /* Scatter gather element */ 66 unsigned char *sv_addr; /* Virtual address of data */ 67 unsigned int sv_vme; /* VME address of data */ 68 int sv_len; /* Data length in bytes */ 69 } sd_vec_t; 70 71 The upper level routines should reference only: handle->bh_error, 72 handle->bh_bufvec. The bh_bufvec is an array of _sd_vec_t with the 73 last item in the array having a NULL sv_addr. 74 75 IMPORTANT: The handle should be treated read-only and never be modified. 76 77 1) Multiple accesses to a single file will be supported. 78 (Side effect: If a process owning cache blocks of a file attempts 79 to allocate overlapping cache blocks, it will be a 80 deadlock condition.) 81 82 2) Multiple writes to an allocated block will be supported. It 83 is no longer necessary to free and re-allocate between writes. 84 85 3) _SD_NOBLOCK is the equivalent of async_io -- the io will be initiated 86 if required with the call returning _SD_PENDING. A callback 87 (read or write) will be called at io end action. 88 89 4) Disconnect hints to ckd will be provided by the use of 90 either psema or thread_bind() when io needs to be initiated. 91 92 93 NOTE: 94 fba_pos = disk block number, each block being 512 bytes. 95 fba_len = len in disk blocks, each block being 512 bytes. 96 Thus, 512 bytes = 1 fba_len, 1024 = 2 fba_len etc... 97 98 Hints: 99 _SD_WRTHRU: write through mode. 100 This hint can be set on a node, a device or per access. 
101 _SD_FORCED_WRTHRU: forced write through (node down or flow control) 102 If this hint is cleared, when only one node is up, 103 _sd_uncommit() will not work properly, and a second 104 failure could result in lost data. 105 This is a node hint. 106 _SD_NOCACHE: reuse cache blocks immediately instead of keeping 107 in lru order. 108 This hint can be set on a device or per access. 109 110 Interface: 111 112 _sd_buf_handle_t * 113 _sd_alloc_handle(discon_cb, read_cb, write_cb) 114 void (*discon_cb)(); 115 void (*read_cb)(); 116 void (*write_cb)(); 117 118 The callbacks can be NULL if you do not want any callbacks. 119 Else, the callbacks will be stored in the handle, and will be 120 called at specific points in the cache. (It's up to the 121 callback to do what is necessary, including disconnecting 122 from the channel) 123 124 Usage: for better performance, an application could allocate 125 a handle (or as many handles as are required) upfront and 126 use it later on in the cache calls. 127 128 Not allocating and managing the handles would mean a new 129 handle will be allocated and freed during _sd_alloc_buf 130 and _sd_free_buf. 131 132 int 133 _sd_free_handle(handle) 134 _sd_buf_handle_t *handle; 135 136 Only handles that are allocated through _sd_alloc_handle 137 should be freed with this call. 138 139 int 140 _sd_alloc_buf (cd, fba_pos, fba_len, flag, handle_p) 141 int cd; 142 int fba_pos; 143 int fba_len; 144 int flag; 145 _sd_buf_handle_t **handle_p; 146 147 cd = cache descriptor. Results in an error if this node does 148 not own this disk and the other node has not crashed. 149 (ie. requests must be routed to the correct node) 150 (see fault tolerant aspects discussed elsewhere) 151 152 fba_pos = disk position in multiples of 512 byte blocks. 153 fba_len = length in multiples of 512 byte blocks. 
154 (NOTE: This cannot exceed _SD_MAX_FBAS) 155 156 flag = None, one or more of the following (described below): 157 _SD_RDBUF | _SD_WRBUF | _SD_RDWRBUF | _SD_PINNABLE | 158 _SD_NOBLOCK | _SD_NOCACHE | _SD_WRTHRU 159 160 handle_p = (*handle_p = handle to be used for this call) 161 If *handle_p == NULL, a new handle will be 162 allocated. _sd_free_buf will free up any handles 163 allocated in this fashion. 164 NOTE: Handles allocated in this fashion will not have 165 any callbacks registered in them. As such, 166 _SD_NOBLOCK flag along with a NULL handle would 167 result in the io being lost. 168 169 return: Error number if > 0 170 possible errors: 171 EINVAL if arguments are incorrect or 172 cache not initialized or 173 device not open. 174 E2BIG if this request is a read and such a large 175 request cannot be currently satisfied. (break up 176 the io or re-issue at a later point) 177 EIO or any other errno that the driver might return. 178 Note: on error, the handle is not active, 179 and also is freed if *handle_p was NULL. 180 181 if 0 or less, status will be one of: 182 _SD_DONE: buffer is ready to be used. 183 (with the blocks valid if _SD_RDBUF is set) 184 _SD_PENDING: 185 read callback, if one has been registered in the handle, 186 will be called to complete this request. 187 _SD_HIT: Same as _SD_DONE, read was satisfied by cache, 188 or no blocking required for write buffer. 189 190 Note: _SD_RDBUF will issue the read if necessary. 191 _SD_WRBUF allocates a network address to reflect to 192 mirror node on _sd_write(). 193 ~_SD_RDBUF allocates buffers but does NOT pre-read; 194 use _sd_read() to fill in (portions) as req'd. 195 196 Note: flag == (_SD_RDBUF|_SD_WRTHRU|_SD_NOCACHE) will 197 clear valid bits (that are not dirty) and thus read directly 198 from disk, without requiring a hash invalidate. 
199 200 201 int 202 _sd_write (handle, fba_pos, fba_len, flag) 203 _sd_buf_handle_t *handle; 204 int fba_pos, fba_len; 205 int flag; 206 207 handle = handle previously allocated in allocate buf. 208 fba_pos and fba_len have to be within the allocated portion. 209 int flag. Flag: _SD_NOBLOCK | _SD_WRTHRU 210 211 Attempting to write to a handle that was not allocated for write 212 will return an error (EINVAL). 213 214 returns: errno if return > 0 215 if 0 or less, return will be one of: 216 _SD_PENDING: will be returned only if _SD_NOBLOCK is set AND 217 either the flag is _SD_WRTHRU or the other node is down, 218 or the device/node is in write through mode. 219 _SD_DONE: is returned if the block has been written to the disk. 220 _SD_HIT: is returned if the block was written to the cache. 221 222 int 223 _sd_read (handle, fba_pos, fba_len, flag) 224 _sd_buf_handle_t *handle; 225 int fba_pos, fba_len; 226 int flag; 227 228 handle = handle previously allocated in allocate buf. 229 fba_pos and fba_len have to be within the allocated portion. 230 int flag. Flag: _SD_NOBLOCK 231 232 returns: errno if return > 0 233 error E2BIG if this request is big and cannot be currently 234 satisfied. (break up the io or re-issue at a later point) 235 236 if 0 or less, return will be one of: 237 _SD_PENDING: will be returned only if _SD_NOBLOCK is set and 238 we need to do an io. 239 _SD_HIT: is returned if the blocks were satisfied by cache. 240 _SD_DONE: some blocks were read from disk. 241 242 int 243 _sd_uncommit(handle, fba_pos, fba_len, flag) 244 _sd_buf_handle_t *handle; 245 int fba_pos, fba_len; 246 int flag; 247 248 handle = handle previously allocated in allocate buf. 249 fba_pos and fba_len have to be within the allocated portion. 250 flag: reserved for future use. 251 252 _sd_uncommit could block and cannot be called from a 253 "non-blocking" context. 
254 (This is under review, from the ckd point of view) 255 256 returns 0 (_SD_DONE) else errno; 257 258 259 int 260 _sd_zero (handle, fba_pos, fba_len, flag) 261 _sd_buf_handle_t *handle; 262 int fba_pos, fba_len; 263 int flag; 264 265 handle = handle previously allocated in allocate buf. 266 fba_pos and fba_len have to be within the allocated portion. 267 zero the buffer described by the handle. 268 flag: _SD_NOBLOCK | _SD_WRTHRU 269 270 The call commits data to disk. 271 This call has characteristics similar to _sd_write. 272 273 returns: errno if return > 0 274 if 0 or less, return will be one of: 275 _SD_DONE 276 _SD_PENDING 277 278 _sd_copy (handle1, handle2, fba_pos1, fba_pos2, fba_len) 279 _sd_buf_handle_t *handle1, *handle2; 280 int fba_pos1, fba_pos2, fba_len; 281 282 Copies relevant data from handle1 to handle2. 283 Useful for mirroring, remote dual copy, backup while open, 284 in-house tests, etc. 285 286 This call does not commit data to disk - you must explicitly 287 call _sd_write() on handle2 if that is what you want. 288 289 returns: errno if return > 0: 290 EIO - if sd module should do a generic bcopy 291 others - real error (passed to user) 292 if 0 or less, return will be: 293 _SD_DONE - success 294 295 _sd_free_buf(handle) 296 _sd_buf_handle_t *handle; 297 298 handle = handle previously allocated in allocate buf. 299 300 returns 0 (_SD_DONE) else errno; 301 302 _sd_open(filename, flag) 303 char *filename; 304 int flag; 305 306 returns a cache descriptor, or a negative error number. 307 Typically use _sd_attach_cd(cd) before accessing the device. 308 Note: if the device is already open, it returns the same cache descriptor. 309 Currently there is no reference count; so one _sd_close() closes 310 the cache descriptor (in all contexts). 311 312 _sd_close(cd) 313 int cd; 314 Similar to _sd_detach_cd below. 315 Note: intended to be called when terminating the cache; and not during 316 normal operation. No reference count (see above). 
317 Returns: 0 on success, EIO. 318 319 _sd_detach_cd(cd) 320 re-reflect any pinned blocks to the other side, 321 or wait for writes to flush; and invalidate that device's hash entries, 322 and relinquish device responsibility. 323 Returns: 0 on success, EIO, EAGAIN. 324 325 _sd_attach_cd(cd) 326 If the device has pinned blocks then scan for and re-pin those blocks 327 (same idea as "node recovery" process, but per-device); 328 and assert device responsibility. 329 330 _sd_notify_all_pin(cd) 331 rescan the list of failed blocks and re-issue the pinned callback to 332 simulation. 333 334 335 _sd_register_pinned(func) 336 void (*func)(); 337 callback (*func)(cd, fba_pos, fba_len) when a disk write fails, 338 and _SD_PENDING was specified on alloc. 339 340 _sd_register_unpinned(func) 341 void (*func)(); 342 callback (*func)(cd, fba_pos, fba_len) when data previously pinned 343 is successfully written to disk. 344 345 _sd_register_down(func) 346 void (*func)(); 347 callback (*func)() when the health monitor detects the other node went down. 348 349 _sd_set_hint(cd, hint) 350 _sd_clear_hint(cd, hint) 351 _sd_get_cd_hint(cd, &hint) 352 _sd_set_node_hint(hint) 353 _sd_clear_node_hint(hint) 354 _sd_get_node_hint(&hint) 355 356 where hint is _SD_NOCACHE and/or _SD_WRTHRU. (Write through is a synchronous 357 write and will be the default if the second node dies.) 358 359 _SD_NOCACHE: hint indicating that the current access need not be 360 cached for later consumption. 361 362 363 _sd_discard_pinned(cd, fba_pos, fba_len) 364 call from ckd into cache, called when data that was pinned 365 earlier on can be discarded from the cache. 366 367 returns: 0 or error. 368 (error = EINVAL if the discard could not be done) 369 370 (note: there is an inherent race between the unpinned callback and 371 _sd_discard_pinned which could put the data on disk in an inconsistent 372 state) 373 374 375 Failover support: 376 377 The Nodedown callback will be called, if one has been registered. 
This 378 will happen as soon as the other node has been detected to have gone down, 379 or when the cache is disabled on the other node. 380 381 The amount of time for this callback to happen after the node goes down 382 is not deterministic. 383 384 Access to a mirror node's devices is only valid from the point the 385 nodedown callback is called till the other node is determined to be back 386 in operation. 387 388 Access to a mirror node's devices while recovery is in progress will 389 block until the recovery is complete.