1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
14 */
15
16 /*
17 * Disk Lights Agent (FMA)
18 *
19 * This Fault Management Daemon (fmd) module periodically scans the topology
20 * tree, enumerates all disks with associated fault indicators, and then
21 * synchronises the fault status of resources in the FMA Resource Cache with
22 * the indicators. In short: it turns the fault light on for befallen disks.
23 *
24 * Presently, we recognise associated fault indicators for disks by looking
25 * for the following structure in the topology tree:
26 *
27 * /bay=N
28 * |
29 * +---- /disk=0 <---------------- our Disk
30 * |
31 * +---- /bay=N?indicator=fail <---- the Fault Light
32 * \---- /bay=N?indicator=ident
33 *
34 * That is: a DISK node will have a parent BAY; that BAY will itself have
35 * child Facility nodes, one of which will be called "fail". If any of the
36 * above does not hold, we simply do nothing for this disk.
37 */
38
39 #include <string.h>
40 #include <strings.h>
41 #include <libnvpair.h>
42 #include <fm/libtopo.h>
43 #include <fm/topo_list.h>
44 #include <fm/topo_hc.h>
45 #include <fm/fmd_api.h>
46 #include <sys/fm/protocol.h>
47
48
/*
 * Per-module state for the Disk Lights Agent.  One instance is allocated in
 * _fmd_init(), attached to the fmd handle with fmd_hdl_setspecific(), and
 * released in _fmd_fini().
 */
typedef struct disk_lights {
	fmd_hdl_t *dl_fmd;		/* fmd handle this state belongs to */
	uint64_t dl_poll_interval;	/* "poll-interval" property value */
	uint64_t dl_coalesce_interval;	/* "coalesce-interval" property value */
	id_t dl_timer;			/* last installed timer; 0 before any */
	boolean_t dl_triggered;		/* B_TRUE while coalesce timer pending */
} disk_lights_t;
56
/*
 * Forward declarations for the fmd entry points that are referenced by the
 * module operations vector before they are defined.
 */
static void disklights_topo(fmd_hdl_t *, topo_hdl_t *);
static void disklights_recv(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *);
static void disklights_timeout(fmd_hdl_t *, id_t, void *);
61
/*
 * Module operations vector, published to fmd via fmd_info.  The initializer
 * is positional: the trailing comment on each line names the slot it fills.
 * Slots set to NULL are entry points this module does not implement.
 */
static const fmd_hdl_ops_t fmd_ops = {
	disklights_recv,	/* fmdo_recv */
	disklights_timeout,	/* fmdo_timeout */
	NULL,			/* fmdo_close */
	NULL,			/* fmdo_stats */
	NULL,			/* fmdo_gc */
	NULL,			/* fmdo_send */
	disklights_topo,	/* fmdo_topo */
};
71
/*
 * POLL_INTERVAL is the period after which we perform an unsolicited poll
 * to ensure we remain in sync with reality.
 */
#define	DL_PROP_POLL_INTERVAL		"poll-interval"

/*
 * COALESCE_INTERVAL is how long we wait after we are triggered, by either a
 * topology change or a relevant list.* event, before scanning.  The delay
 * allows a burst of such events to coalesce into a single scan.
 */
#define	DL_PROP_COALESCE_INTERVAL	"coalesce-interval"
84
/*
 * Module configuration properties.  Each entry gives the property name, its
 * type, and its default value as a string; the all-NULL entry terminates
 * the table.
 */
static const fmd_prop_t fmd_props[] = {
	{ DL_PROP_POLL_INTERVAL, FMD_TYPE_TIME, "5min" },
	{ DL_PROP_COALESCE_INTERVAL, FMD_TYPE_TIME, "3s" },
	{ NULL, 0, NULL }
};
90
/*
 * Module description handed to fmd_hdl_register() in _fmd_init().  The
 * initializer is positional: description, version, operations vector, and
 * property table.
 */
static const fmd_hdl_info_t fmd_info = {
	"Disk Lights Agent",
	"1.0",
	&fmd_ops,
	fmd_props
};
97
98 /*
99 * Fetch the Facility Node properties (name, type) from the FMRI
100 * for this node, or return -1 if we can't.
101 */
102 static int
103 get_facility_props(tnode_t *node, char **facname, char **factype)
104 {
105 int e;
106 nvlist_t *fmri, *fnvl;
107
108 if (topo_node_resource(node, &fmri, &e) != 0)
109 return (-1);
110
111 if (nvlist_lookup_nvlist(fmri, FM_FMRI_FACILITY, &fnvl) != 0)
112 return (-1);
113
114 if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_NAME, facname) != 0)
115 return (-1);
116
117 if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_TYPE, factype) != 0)
118 return (-1);
119
120 return (0);
121 }
122
123 typedef struct dl_fault_walk_inner {
124 char *fwi_name;
125 uint32_t fwi_mode;
126 } dl_fault_walk_inner_t;
127
128 static int
129 dl_fault_walk_inner(topo_hdl_t *thp, tnode_t *node, void *arg)
130 {
131 dl_fault_walk_inner_t *fwi = arg;
132 char *facname, *factype;
133 int err;
134
135 /*
136 * We're only interested in BAY children that are valid Facility Nodes.
137 */
138 if (topo_node_flags(node) != TOPO_NODE_FACILITY ||
139 get_facility_props(node, &facname, &factype) != 0)
140 return (TOPO_WALK_NEXT);
141
142 if (strcmp(fwi->fwi_name, facname) != 0)
143 return (TOPO_WALK_NEXT);
144
145 /*
146 * Attempt to set the LED mode appropriately. If this fails, give up
147 * and move on.
148 */
149 (void) topo_prop_set_uint32(node, TOPO_PGROUP_FACILITY, TOPO_LED_MODE,
150 TOPO_PROP_MUTABLE, fwi->fwi_mode, &err);
151
152 return (TOPO_WALK_NEXT);
153 }
154
155 static int
156 dl_fault_walk_outer(topo_hdl_t *thp, tnode_t *node, void *arg)
157 {
158 disk_lights_t *dl = arg;
159 dl_fault_walk_inner_t fwi;
160 tnode_t *pnode;
161 int err, has_fault;
162 nvlist_t *fmri = NULL;
163
164 bzero(&fwi, sizeof (fwi));
165
166 /*
167 * We are only looking for DISK nodes in the topology that have a parent
168 * BAY.
169 */
170 if (strcmp(DISK, topo_node_name(node)) != 0 ||
171 (pnode = topo_node_parent(node)) == NULL ||
172 strcmp(BAY, topo_node_name(pnode)) != 0)
173 return (TOPO_WALK_NEXT);
174
175 /*
176 * Check to see if the Resource this FMRI describes is Faulty:
177 */
178 if (topo_node_resource(node, &fmri, &err) != 0)
179 return (TOPO_WALK_NEXT);
180 has_fault = fmd_nvl_fmri_has_fault(dl->dl_fmd, fmri,
181 FMD_HAS_FAULT_RESOURCE, NULL);
182 nvlist_free(fmri);
183
184
185 /*
186 * Walk the children of this BAY and flush out our fault status if
187 * we find an appropriate indicator node.
188 */
189 fwi.fwi_name = "fail";
190 fwi.fwi_mode = has_fault ? TOPO_LED_STATE_ON : TOPO_LED_STATE_OFF;
191 (void) topo_node_child_walk(thp, pnode, dl_fault_walk_inner, &fwi);
192
193 return (TOPO_WALK_NEXT);
194 }
195
196 /*
197 * Walk all of the topology nodes looking for DISKs that match the structure
198 * described in the overview. Once we find them, check their fault status
199 * and update their fault indiciator accordingly.
200 */
201 static void
202 dl_examine_topo(disk_lights_t *dl)
203 {
204 int err;
205 topo_hdl_t *thp = NULL;
206 topo_walk_t *twp = NULL;
207
208 thp = fmd_hdl_topo_hold(dl->dl_fmd, TOPO_VERSION);
209 if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, dl_fault_walk_outer,
210 dl, &err)) == NULL) {
211 fmd_hdl_error(dl->dl_fmd, "failed to get topology: %s\n",
212 topo_strerror(err));
213 goto out;
214 }
215
216 if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) {
217 fmd_hdl_error(dl->dl_fmd, "failed to walk topology: %s\n",
218 topo_strerror(err));
219 goto out;
220 }
221
222 out:
223 if (twp != NULL)
224 topo_walk_fini(twp);
225 if (thp != NULL)
226 fmd_hdl_topo_rele(dl->dl_fmd, thp);
227 }
228
229 static void
230 dl_trigger_enum(disk_lights_t *dl)
231 {
232 /*
233 * If we're already on the short-poll coalesce timer, then return
234 * immediately.
235 */
236 if (dl->dl_triggered == B_TRUE)
237 return;
238 dl->dl_triggered = B_TRUE;
239
240 /*
241 * Replace existing poll timer with coalesce timer:
242 */
243 if (dl->dl_timer != 0)
244 fmd_timer_remove(dl->dl_fmd, dl->dl_timer);
245 dl->dl_timer = fmd_timer_install(dl->dl_fmd, NULL, NULL,
246 dl->dl_coalesce_interval);
247 }
248
249 static void
250 disklights_timeout(fmd_hdl_t *hdl, id_t id, void *data)
251 {
252 disk_lights_t *dl = fmd_hdl_getspecific(hdl);
253
254 dl->dl_triggered = B_FALSE;
255
256 dl_examine_topo(dl);
257
258 /*
259 * Install the long-interval timer for the next poll.
260 */
261 dl->dl_timer = fmd_timer_install(hdl, NULL, NULL, dl->dl_poll_interval);
262 }
263
264 static void
265 disklights_topo(fmd_hdl_t *hdl, topo_hdl_t *thp)
266 {
267 disk_lights_t *dl = fmd_hdl_getspecific(hdl);
268
269 dl_trigger_enum(dl);
270 }
271
272 static void
273 disklights_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
274 const char *class)
275 {
276 disk_lights_t *dl = fmd_hdl_getspecific(hdl);
277
278 dl_trigger_enum(dl);
279 }
280
281 void
282 _fmd_init(fmd_hdl_t *hdl)
283 {
284 disk_lights_t *dl;
285
286 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
287 return;
288
289 dl = fmd_hdl_zalloc(hdl, sizeof (*dl), FMD_SLEEP);
290 fmd_hdl_setspecific(hdl, dl);
291
292 /*
293 * Load Configuration:
294 */
295 dl->dl_fmd = hdl;
296 dl->dl_poll_interval = fmd_prop_get_int64(hdl, DL_PROP_POLL_INTERVAL);
297 dl->dl_coalesce_interval = fmd_prop_get_int64(hdl,
298 DL_PROP_COALESCE_INTERVAL);
299
300 /*
301 * Schedule the initial enumeration:
302 */
303 dl_trigger_enum(dl);
304 }
305
306 void
307 _fmd_fini(fmd_hdl_t *hdl)
308 {
309 disk_lights_t *dl = fmd_hdl_getspecific(hdl);
310
311 fmd_hdl_free(hdl, dl, sizeof (*dl));
312 }