1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 14 */ 15 16 /* 17 * Disk Lights Agent (FMA) 18 * 19 * This Fault Management Daemon (fmd) module periodically scans the topology 20 * tree, enumerates all disks with associated fault indicators, and then 21 * synchronises the fault status of resources in the FMA Resource Cache with 22 * the indicators. In short: it turns the fault light on for befallen disks. 23 * 24 * Presently, we recognise associated fault indicators for disks by looking 25 * for the following structure in the topology tree: 26 * 27 * /bay=N 28 * | 29 * +---- /disk=0 <---------------- our Disk 30 * | 31 * +---- /bay=N?indicator=fail <---- the Fault Light 32 * \---- /bay=N?indicator=ident 33 * 34 * That is: a DISK node will have a parent BAY; that BAY will itself have 35 * child Facility nodes, one of which will be called "fail". If any of the 36 * above does not hold, we simply do nothing for this disk. 37 */ 38 39 #include <string.h> 40 #include <strings.h> 41 #include <libnvpair.h> 42 #include <fm/libtopo.h> 43 #include <fm/topo_list.h> 44 #include <fm/topo_hc.h> 45 #include <fm/fmd_api.h> 46 #include <sys/fm/protocol.h> 47 48 49 typedef struct disk_lights { 50 fmd_hdl_t *dl_fmd; 51 uint64_t dl_poll_interval; 52 uint64_t dl_coalesce_interval; 53 id_t dl_timer; 54 boolean_t dl_triggered; 55 } disk_lights_t; 56 57 static void disklights_topo(fmd_hdl_t *, topo_hdl_t *); 58 static void disklights_recv(fmd_hdl_t *, fmd_event_t *, nvlist_t *, 59 const char *); 60 static void disklights_timeout(fmd_hdl_t *, id_t, void *); 61 62 static const fmd_hdl_ops_t fmd_ops = { 63 disklights_recv, /* fmdo_recv */ 64 disklights_timeout, /* fmdo_timeout */ 65 NULL, /* fmdo_close */ 66 NULL, /* fmdo_stats */ 67 NULL, /* fmdo_gc */ 68 NULL, /* fmdo_send */ 69 disklights_topo, /* fmdo_topo */ 70 }; 71 72 /* 73 * POLL_INTERVAL is the period after which we perform an unsolicited poll 74 * to ensure we remain in sync with reality. 75 */ 76 #define DL_PROP_POLL_INTERVAL "poll-interval" 77 78 /* 79 * COALESCE_INTERVAL is how long we wait after we are trigged by either a 80 * topology change or a relevant list.* event, in order to allow a series 81 * of events to coalesce. 82 */ 83 #define DL_PROP_COALESCE_INTERVAL "coalesce-interval" 84 85 static const fmd_prop_t fmd_props[] = { 86 { DL_PROP_POLL_INTERVAL, FMD_TYPE_TIME, "5min" }, 87 { DL_PROP_COALESCE_INTERVAL, FMD_TYPE_TIME, "3s" }, 88 { NULL, 0, NULL } 89 }; 90 91 static const fmd_hdl_info_t fmd_info = { 92 "Disk Lights Agent", 93 "1.0", 94 &fmd_ops, 95 fmd_props 96 }; 97 98 /* 99 * Fetch the Facility Node properties (name, type) from the FMRI 100 * for this node, or return -1 if we can't. 101 */ 102 static int 103 get_facility_props(topo_hdl_t *hdl, tnode_t *node, char **facname, 104 char **factype) 105 { 106 int e, ret = -1; 107 nvlist_t *fmri = NULL, *fnvl; 108 char *nn = NULL, *tt = NULL; 109 110 if (topo_node_resource(node, &fmri, &e) != 0) 111 goto out; 112 113 if (nvlist_lookup_nvlist(fmri, FM_FMRI_FACILITY, &fnvl) != 0) 114 goto out; 115 116 if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_NAME, &nn) != 0) 117 goto out; 118 119 if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_TYPE, &tt) != 0) 120 goto out; 121 122 *facname = topo_hdl_strdup(hdl, nn); 123 *factype = topo_hdl_strdup(hdl, tt); 124 ret = 0; 125 126 out: 127 nvlist_free(fmri); 128 return (ret); 129 } 130 131 typedef struct dl_fault_walk_inner { 132 char *fwi_name; 133 uint32_t fwi_mode; 134 } dl_fault_walk_inner_t; 135 136 static int 137 dl_fault_walk_inner(topo_hdl_t *thp, tnode_t *node, void *arg) 138 { 139 dl_fault_walk_inner_t *fwi = arg; 140 char *facname = NULL, *factype = NULL; 141 int err; 142 143 /* 144 * We're only interested in BAY children that are valid Facility Nodes. 145 */ 146 if (topo_node_flags(node) != TOPO_NODE_FACILITY || 147 get_facility_props(thp, node, &facname, &factype) != 0) { 148 goto out; 149 } 150 151 if (strcmp(fwi->fwi_name, facname) != 0) 152 goto out; 153 154 /* 155 * Attempt to set the LED mode appropriately. If this fails, give up 156 * and move on. 157 */ 158 (void) topo_prop_set_uint32(node, TOPO_PGROUP_FACILITY, TOPO_LED_MODE, 159 TOPO_PROP_MUTABLE, fwi->fwi_mode, &err); 160 161 out: 162 topo_hdl_strfree(thp, facname); 163 topo_hdl_strfree(thp, factype); 164 return (TOPO_WALK_NEXT); 165 } 166 167 static int 168 dl_fault_walk_outer(topo_hdl_t *thp, tnode_t *node, void *arg) 169 { 170 disk_lights_t *dl = arg; 171 dl_fault_walk_inner_t fwi; 172 tnode_t *pnode; 173 int err, has_fault; 174 nvlist_t *fmri = NULL; 175 176 bzero(&fwi, sizeof (fwi)); 177 178 /* 179 * We are only looking for DISK nodes in the topology that have a parent 180 * BAY. 181 */ 182 if (strcmp(DISK, topo_node_name(node)) != 0 || 183 (pnode = topo_node_parent(node)) == NULL || 184 strcmp(BAY, topo_node_name(pnode)) != 0) { 185 return (TOPO_WALK_NEXT); 186 } 187 188 /* 189 * Check to see if the Resource this FMRI describes is Faulty: 190 */ 191 if (topo_node_resource(node, &fmri, &err) != 0) 192 return (TOPO_WALK_NEXT); 193 has_fault = fmd_nvl_fmri_has_fault(dl->dl_fmd, fmri, 194 FMD_HAS_FAULT_RESOURCE, NULL); 195 nvlist_free(fmri); 196 197 /* 198 * Walk the children of this BAY and flush out our fault status if 199 * we find an appropriate indicator node. 200 */ 201 fwi.fwi_name = "fail"; 202 fwi.fwi_mode = has_fault ? TOPO_LED_STATE_ON : TOPO_LED_STATE_OFF; 203 (void) topo_node_child_walk(thp, pnode, dl_fault_walk_inner, &fwi, 204 &err); 205 206 return (TOPO_WALK_NEXT); 207 } 208 209 /* 210 * Walk all of the topology nodes looking for DISKs that match the structure 211 * described in the overview. Once we find them, check their fault status 212 * and update their fault indiciator accordingly. 213 */ 214 static void 215 dl_examine_topo(disk_lights_t *dl) 216 { 217 int err; 218 topo_hdl_t *thp = NULL; 219 topo_walk_t *twp = NULL; 220 221 thp = fmd_hdl_topo_hold(dl->dl_fmd, TOPO_VERSION); 222 if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, dl_fault_walk_outer, 223 dl, &err)) == NULL) { 224 fmd_hdl_error(dl->dl_fmd, "failed to get topology: %s\n", 225 topo_strerror(err)); 226 goto out; 227 } 228 229 if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) { 230 fmd_hdl_error(dl->dl_fmd, "failed to walk topology: %s\n", 231 topo_strerror(err)); 232 goto out; 233 } 234 235 out: 236 if (twp != NULL) 237 topo_walk_fini(twp); 238 if (thp != NULL) 239 fmd_hdl_topo_rele(dl->dl_fmd, thp); 240 } 241 242 static void 243 dl_trigger_enum(disk_lights_t *dl) 244 { 245 /* 246 * If we're already on the short-poll coalesce timer, then return 247 * immediately. 248 */ 249 if (dl->dl_triggered == B_TRUE) 250 return; 251 dl->dl_triggered = B_TRUE; 252 253 /* 254 * Replace existing poll timer with coalesce timer: 255 */ 256 if (dl->dl_timer != 0) 257 fmd_timer_remove(dl->dl_fmd, dl->dl_timer); 258 dl->dl_timer = fmd_timer_install(dl->dl_fmd, NULL, NULL, 259 dl->dl_coalesce_interval); 260 } 261 262 static void 263 disklights_timeout(fmd_hdl_t *hdl, id_t id, void *data) 264 { 265 disk_lights_t *dl = fmd_hdl_getspecific(hdl); 266 267 dl->dl_triggered = B_FALSE; 268 269 dl_examine_topo(dl); 270 271 /* 272 * Install the long-interval timer for the next poll. 273 */ 274 dl->dl_timer = fmd_timer_install(hdl, NULL, NULL, dl->dl_poll_interval); 275 } 276 277 static void 278 disklights_topo(fmd_hdl_t *hdl, topo_hdl_t *thp) 279 { 280 disk_lights_t *dl = fmd_hdl_getspecific(hdl); 281 282 dl_trigger_enum(dl); 283 } 284 285 static void 286 disklights_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, 287 const char *class) 288 { 289 disk_lights_t *dl = fmd_hdl_getspecific(hdl); 290 291 dl_trigger_enum(dl); 292 } 293 294 void 295 _fmd_init(fmd_hdl_t *hdl) 296 { 297 disk_lights_t *dl; 298 299 if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) 300 return; 301 302 dl = fmd_hdl_zalloc(hdl, sizeof (*dl), FMD_SLEEP); 303 fmd_hdl_setspecific(hdl, dl); 304 305 /* 306 * Load Configuration: 307 */ 308 dl->dl_fmd = hdl; 309 dl->dl_poll_interval = fmd_prop_get_int64(hdl, DL_PROP_POLL_INTERVAL); 310 dl->dl_coalesce_interval = fmd_prop_get_int64(hdl, 311 DL_PROP_COALESCE_INTERVAL); 312 313 /* 314 * Schedule the initial enumeration: 315 */ 316 dl_trigger_enum(dl); 317 } 318 319 void 320 _fmd_fini(fmd_hdl_t *hdl) 321 { 322 disk_lights_t *dl = fmd_hdl_getspecific(hdl); 323 324 fmd_hdl_free(hdl, dl, sizeof (*dl)); 325 }