1 # CDDL HEADER START
2 #
3 # The contents of this file are subject to the terms of the
4 # Common Development and Distribution License (the "License").
5 # You may not use this file except in compliance with the License.
6 #
7 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
8 # or http://www.opensolaris.org/os/licensing.
9 # See the License for the specific language governing permissions
10 # and limitations under the License.
11 #
12 # When distributing Covered Code, include this CDDL HEADER in each
13 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
14 # If applicable, add the following below this CDDL HEADER, with the
15 # fields enclosed by brackets "[]" replaced with your own identifying
16 # information: Portions Copyright [yyyy] [name of copyright owner]
17 #
18 # CDDL HEADER END
19 #
20 # Copyright 2008 Sun Microsystems, Inc. All rights reserved.
21 # Use is subject to license terms.
22 #
23 # $Id: CACHE_SPEC,v 3.6.0.0 1998/01/05 22:55:19 idumois Exp $
24 #
25
26 "sd" cache layer
27 ----------------
28 #include <sys/sd/sd.h>
29
30 The "sd" layer provides a common interface to the functionality
31 described below. It will also allow switching to a direct to disk
32 version, so that a new cache module could be loaded.
33 The functions are basically the same as those below,
34 but named without the leading underscore.
35 (ie sd_alloc_buf instead of _sd_alloc_buf)
36
37
38 "sdbc" -- storage device block cache (aka blkc)
39 -----------------------------------------------
40
41 #include "uts/sd/sdbc/sd_cache.h" /* for SDBC interface */
42 #include "sys/sd/sd.h" /* for generic SD interface */
43
44 (All interaction is in terms of the buf_handle.)
45
46 Currently buf_handle is declared as:
47
48 #define _SD_MAX_BLKS 64
49 #define _SD_MAX_FBAS (_SD_MAX_BLKS << FBA_SHFT)
50
51 typedef struct _sd_buf_handle {
52 int bh_cd; /* actually bh_buf.sb_cd */
53 int bh_fba_pos; /* bh_buf.sb_pos */
54 int bh_fba_len; /* bh_buf.sb_len */
55 int bh_flag; /* bh_buf.sb_flag */
56 int bh_error; /* bh_buf.sb_error */
57 _sd_vec_t bh_bufvec[_SD_MAX_BLKS]; /* bh_buf.sb_vec */
58 void (*bh_disconnect_cb)();
59 void (*bh_read_cb)();
60 void (*bh_write_cb)();
61 ......
62 } _sd_buf_handle_t;
63
64
65 typedef struct sd_vec_s { /* Scatter gather element */
66 unsigned char *sv_addr; /* Virtual address of data */
67 unsigned int sv_vme; /* VME address of data */
68 int sv_len; /* Data length in bytes */
69 } sd_vec_t;
70
71 The upper level routines should reference only handle->bh_error and
72 handle->bh_bufvec. The bh_bufvec is an array of _sd_vec_t with the
73 last item in the array having a NULL sv_addr.
74
75 IMPORTANT: The handle should be treated read-only and never be modified.
76
77 1) Multiple accesses to a single file will be supported.
78 (Side effect: If a process owning cache blocks of a file attempts
79 to allocate overlapping cache blocks, it will be a
80 deadlock condition.)
81
82 2) Multiple writes to an allocated block will be supported. It
83 is no longer necessary to free and re-allocate between writes.
84
85 3) _SD_NOBLOCK is the equivalent of async_io -- the io will be initiated
86 if required with the call returning _SD_PENDING. A callback
87 (read or write) will be called at io end action.
88
89 4) Disconnect hints to ckd will be provided by the use of
90 either psema or thread_bind() when io needs to be initiated.
91
92
93 NOTE:
94 fba_pos = disk block number, each block being 512 bytes.
95 fba_len = len in disk blocks, each block being 512 bytes.
96 Thus, 512 bytes = 1 fba_len, 1024 = 2 fba_len etc...
97
98 Hints:
99 _SD_WRTHRU: write through mode.
100 This hint can be set on a node, a device or per access.
101 _SD_FORCED_WRTHRU: forced write through (node down or flow control)
102 If this hint is cleared, when only one node is up,
103 _sd_uncommit() will not work properly, and a second
104 failure could result in lost data.
105 This is a node hint.
106 _SD_NOCACHE: reuse cache blocks immediately instead of keeping
107 in lru order.
108 This hint can be set on a device or per access.
109
110 Interface:
111
112 _sd_buf_handle_t *
113 _sd_alloc_handle(discon_cb, read_cb, write_cb)
114 void (*discon_cb)();
115 void (*read_cb)();
116 void (*write_cb)();
117
118 The callbacks can be NULL if you do not want any callbacks.
119 Else, the callbacks will be stored in the handle, and will be
120 called at specific points in the cache. (It's up to the
121 callback to do what is necessary, including disconnecting
122 from the channel)
123
124 Usage: for better performance, an application could allocate
125 a handle (or as many handles as is required) upfront and
126 use it later on in the cache calls.
127
128 Not allocating and managing the handles would mean a new
129 handle will be allocated and freed during _sd_alloc_buf
130 and _sd_free_buf.
131
132 int
133 _sd_free_handle(handle)
134 _sd_buf_handle_t *handle;
135
136 Only handles that are allocated through _sd_alloc_handle
137 should be freed with this call.
138
139 int
140 _sd_alloc_buf (cd, fba_pos, fba_len, flag, handle_p)
141 int cd;
142 int fba_pos;
143 int fba_len;
144 int flag;
145 _sd_buf_handle_t **handle_p;
146
147 cd = cache descriptor. Results in an error if this node does
148 not own this disk and the other node has not crashed.
149 (ie. requests must be routed to the correct node)
150 (see fault tolerant aspects discussed elsewhere)
151
152 fba_pos = disk position in multiples of 512 byte blocks.
153 fba_len = length in multiples of 512 byte blocks.
154 (NOTE: This cannot exceed _SD_MAX_FBAS)
155
156 flag = None, one or more of the following (described below):
157 _SD_RDBUF | _SD_WRBUF | _SD_RDWRBUF | _SD_PINNABLE |
158 _SD_NOBLOCK | _SD_NOCACHE | _SD_WRTHRU
159
160 handle_p = (*handle_p = handle to be used for this call)
161 If *handle_p == NULL, a new handle will be
162 allocated. _sd_free_buf will free up any handles
163 allocated in this fashion.
164 NOTE: Handles allocated in this fashion will not have
165 any callbacks registered in them. As such,
166 _SD_NOBLOCK flag along with a NULL handle would
167 result in the io being lost.
168
169 return: Error number if > 0
170 possible errors:
171 EINVAL if arguments are incorrect or
172 cache not initialized or
173 device not open.
174 E2BIG if this request is a read and such a large
175 request cannot be currently satisfied. (break up
176 the io or re-issue at a later point)
177 EIO or any other errno that the driver might return.
178 Note: on error, the handle is not active,
179 and also is freed if *handle_p was NULL.
180
181 if 0 or less, status will be one of:
182 _SD_DONE: buffer is ready, and ready to be used.
183 (with the blocks valid if _SD_RDBUF is set)
184 _SD_PENDING:
185 read callback, if one has been registered in the handle,
186 will be called to complete this request.
187 _SD_HIT: Same as _SD_DONE, read was satisfied by cache,
188 or no blocking required for write buffer.
189
190 Note: _SD_RDBUF will issue the read if necessary.
191 _SD_WRBUF allocates a network address to reflect to
192 mirror node on _sd_write().
193 ~_SD_RDBUF allocates buffers but does NOT pre-read;
194 use _sd_read() to fill in (portions) as req'd.
195
196 Note: flag == (_SD_RDBUF|_SD_WRTHRU|_SD_NOCACHE) will
197 clear valid bits (that are not dirty) thus read direct
198 from disk, without requiring a hash invalidate.
199
200
201 int
202 _sd_write (handle, fba_pos, fba_len, flag)
203 _sd_buf_handle_t *handle;
204 int fba_pos, fba_len;
205 int flag;
206 {
207 handle = handle previously allocated in allocate buf.
208 fba_pos and fba_len have to be within the allocated portion.
209 int flag. Flag: _SD_NOBLOCK | _SD_WRTHRU
210
211 Attempting to write to a handle that was not allocated for write
212 will return error (EINVAL)
213
214 returns: errno if return > 0
215 if 0 or less, return will be one of:
216 _SD_PENDING: will be returned only if _SD_NOBLOCK is set AND
217 either the flag is _SD_WRTHRU or the other node is down,
218 or the device/node is in write through mode
219 _SD_DONE: is returned if the block has been written to the disk.
220 _SD_HIT: block was written to the cache.
221
222 int
223 _sd_read (handle, fba_pos, fba_len, flag)
224 _sd_buf_handle_t *handle;
225 int fba_pos, fba_len;
226 int flag;
227
228 handle = handle previously allocated in allocate buf.
229 fba_pos and fba_len have to be within the allocated portion.
230 int flag. Flag: _SD_NOBLOCK
231
232 returns: errno if return > 0
233 error E2BIG if this request is big and cannot be currently
234 satisfied. (break up the io or re-issue at a later point)
235
236 if 0 or less, return will be one of:
237 _SD_PENDING: will be returned only if _SD_NOBLOCK is set and
238 we need to do an io.
239 _SD_HIT: is returned if the blocks were satisfied by cache.
240 _SD_DONE: some blocks were read from disk.
241
242 int
243 _sd_uncommit(handle, fba_pos, fba_len, flag)
244 _sd_buf_handle_t *handle;
245 int fba_pos, fba_len;
246 int flag;
247
248 handle = handle previously allocated in allocate buf.
249 fba_pos and fba_len have to be within the allocated portion.
250 flag: reserved for future use.
251
252 _sd_uncommit could block and cannot be called from a
253 "non-blocking" context.
254 (This is under review, from the ckd point of view)
255
256 returns 0 (_SD_DONE) else errno;
257
258
259 int
260 _sd_zero (handle, fba_pos, fba_len, flag)
261 _sd_buf_handle_t *handle;
262 int fba_pos, fba_len;
263 int flag;
264
265 handle = handle previously allocated in allocate buf.
266 fba_pos and fba_len have to be within the allocated portion.
267 zero the buffer described by the handle.
268 flag: _SD_NOBLOCK | _SD_WRTHRU
269
270 The call commits data to disk.
271 This call has characteristics similar to _sd_write.
272
273 returns: errno if return > 0
274 if 0 or less, return will be one of:
275 _SD_DONE
276 _SD_PENDING
277
278 _sd_copy (handle1, handle2, fba_pos1, fba_pos2, fba_len)
279 _sd_buf_handle_t *handle1, *handle2;
280 int fba_pos1, fba_pos2, fba_len;
281
282 Copies relevant data from handle1 to handle2.
283 Useful for mirroring, remote dual copy, backup while open,
284 in-house tests, etc.
285
286 This call does not commit data to disk - you must explicitly
287 call _sd_write() on handle2 if that is what you want.
288
289 returns: errno if return > 0:
290 EIO - if sd module should do a generic bcopy
291 others - real error (passed to user)
292 if 0 or less, return will be:
293 _SD_DONE - success
294
295 _sd_free_buf(handle)
296 _sd_buf_handle_t *handle;
297
298 handle = handle previously allocated in allocate buf.
299
300 returns 0 (_SD_DONE) else errno;
301
302 _sd_open(filename, flag)
303 char *filename;
304 int flag;
305
306 returns a cache descriptor, or negative error number.
307 Typically use _sd_attach_cd(cd) before accessing the device.
308 Note: if the device is already open, it returns the same cache descriptor.
309 Currently there is no reference count; so one _sd_close() closes
310 the cache descriptor (in all contexts).
311
312 _sd_close(cd)
313 int cd;
314 Similar to _sd_detach_cd below.
315 Note: intended to be called when terminating the cache; and not during
316 normal operation. No reference count (see above).
317 Returns: 0 success, EIO.
318
319 _sd_detach_cd(cd)
320 re-reflect any pinned blocks to the other side,
321 or wait for writes to flush; and invalidate that device's hash entries,
322 and relinquish device responsibility.
323 Returns: 0 success, EIO, EAGAIN.
324
325 _sd_attach_cd(cd)
326 If device has pinned blocks then scan for and re-pin those blocks
327 (same idea as "node recovery" process, but per-device);
328 and assert device responsibility.
329
330 _sd_notify_all_pin(cd)
331 rescan list of failed blocks and re-issue the pinned callback to
332 simulation.
333
334
335 _sd_register_pinned(func)
336 void (*func)();
337 callback (*func)(cd, fba_pos, fba_len) when disk write fails,
338 and _SD_PINNABLE was specified on alloc.
339
340 _sd_register_unpinned(func)
341 void (*func)();
342 callback (*func)(cd, fba_pos, fba_len) when data previously pinned
343 is successfully written to disk.
344
345 _sd_register_down(func)
346 void (*func)();
347 callback (*func)() when health monitor detects the other node went down.
348
349 _sd_set_hint(cd, hint)
350 _sd_clear_hint(cd, hint)
351 _sd_get_cd_hint(cd, &hint)
352 _sd_set_node_hint(hint)
353 _sd_clear_node_hint(hint)
354 _sd_get_node_hint(&hint)
355
356 where hint is _SD_NOCACHE or _SD_WRTHRU. (Write through is a synchronous
357 write and will be the default if the second node dies.)
358
359 _SD_NOCACHE: hint indicating that the current access need not be
360 cached for later consumption.
361
362
363 _sd_discard_pinned(cd, fba_pos, fba_len)
364 call from ckd into cache, called when data that was pinned
365 earlier can be discarded from the cache.
366
367 returns: 0 or error.
368 (error = EINVAL if the discard could not be done)
369
370 (note: there is an inherent race between the unpinned callback and
371 _sd_discard_pinned which could put the data on disk in an inconsistent
372 state)
373
374
375 Failover support:
376
377 The Nodedown callback will be called, if one has been registered. This
378 will happen as soon as the other node has been detected to have gone down,
379 or when the cache is disabled on the other node.
380
381 The amount of time for this callback to happen after the node goes down
382 is not deterministic.
383
384 Access to a mirror node's devices is only valid from the point the
385 nodedown callback is called till the other node is determined to be back
386 in operation.
387
388 Access to mirror node's devices while recovery is in progress will
389 block the access till the recovery is complete.