1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  14  */
  15 
  16 /*
  17  * Disk Lights Agent (FMA)
  18  *
  19  * This Fault Management Daemon (fmd) module periodically scans the topology
  20  * tree, enumerates all disks with associated fault indicators, and then
  21  * synchronises the fault status of resources in the FMA Resource Cache with
  22  * the indicators.  In short: it turns the fault light on for befallen disks.
  23  *
  24  * Presently, we recognise associated fault indicators for disks by looking
  25  * for the following structure in the topology tree:
  26  *
  27  *    /bay=N
  28  *      |
  29  *      +---- /disk=0   <---------------- our Disk
  30  *      |
  31  *      +---- /bay=N?indicator=fail <---- the Fault Light
  32  *      \---- /bay=N?indicator=ident
  33  *
  34  * That is: a DISK node will have a parent BAY; that BAY will itself have
  35  * child Facility nodes, one of which will be called "fail".  If any of the
  36  * above does not hold, we simply do nothing for this disk.
  37  */
  38 
  39 #include <string.h>
  40 #include <strings.h>
  41 #include <libnvpair.h>
  42 #include <fm/libtopo.h>
  43 #include <fm/topo_list.h>
  44 #include <fm/topo_hc.h>
  45 #include <fm/fmd_api.h>
  46 #include <sys/fm/protocol.h>
  47 
  48 
/*
 * Per-module state, allocated in _fmd_init() and attached to the fmd handle
 * with fmd_hdl_setspecific().
 */
typedef struct disk_lights {
        fmd_hdl_t *dl_fmd;              /* fmd handle for this module */
        uint64_t dl_poll_interval;      /* "poll-interval" property value */
        uint64_t dl_coalesce_interval;  /* "coalesce-interval" property value */
        id_t dl_timer;                  /* last installed timer; 0 if none */
        boolean_t dl_triggered;         /* B_TRUE while coalesce timer pends */
} disk_lights_t;
  56 
/*
 * Forward declarations of the fmd entry points; they are referenced by the
 * fmd_ops table before their definitions appear later in this file.
 */
static void disklights_topo(fmd_hdl_t *, topo_hdl_t *);
static void disklights_recv(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
    const char *);
static void disklights_timeout(fmd_hdl_t *, id_t, void *);
  61 
/*
 * Module entry points.  This table is referenced from fmd_info, which is in
 * turn handed to fmd_hdl_register() by _fmd_init().  Entries left NULL are
 * not implemented by this module.
 */
static const fmd_hdl_ops_t fmd_ops = {
        disklights_recv,        /* fmdo_recv */
        disklights_timeout,     /* fmdo_timeout */
        NULL,                   /* fmdo_close */
        NULL,                   /* fmdo_stats */
        NULL,                   /* fmdo_gc */
        NULL,                   /* fmdo_send */
        disklights_topo,        /* fmdo_topo */
};
  71 
/*
 * POLL_INTERVAL is the period after which we perform an unsolicited poll
 * to ensure we remain in sync with reality.
 */
#define DL_PROP_POLL_INTERVAL           "poll-interval"

/*
 * COALESCE_INTERVAL is how long we wait after we are triggered by either a
 * topology change or a relevant list.* event, in order to allow a series
 * of events to coalesce.
 */
#define DL_PROP_COALESCE_INTERVAL       "coalesce-interval"

/*
 * Module properties and their default values.  Both are read once, in
 * _fmd_init(), via fmd_prop_get_int64().  The all-NULL entry terminates
 * the table.
 */
static const fmd_prop_t fmd_props[] = {
        { DL_PROP_POLL_INTERVAL, FMD_TYPE_TIME, "5min" },
        { DL_PROP_COALESCE_INTERVAL, FMD_TYPE_TIME, "3s" },
        { NULL, 0, NULL }
};
  90 
/*
 * Module registration record handed to fmd_hdl_register() in _fmd_init():
 * it ties together the module's name and version strings, its entry point
 * table and its property table.
 */
static const fmd_hdl_info_t fmd_info = {
        "Disk Lights Agent",
        "1.0",
        &fmd_ops,
        fmd_props
};
  97 
  98 /*
  99  * Fetch the Facility Node properties (name, type) from the FMRI
 100  * for this node, or return -1 if we can't.
 101  */
 102 static int
 103 get_facility_props(topo_hdl_t *hdl, tnode_t *node, char **facname,
 104     char **factype)
 105 {
 106         int e, ret = -1;
 107         nvlist_t *fmri = NULL, *fnvl;
 108         char *nn = NULL, *tt = NULL;
 109 
 110         if (topo_node_resource(node, &fmri, &e) != 0)
 111                 goto out;
 112 
 113         if (nvlist_lookup_nvlist(fmri, FM_FMRI_FACILITY, &fnvl) != 0)
 114                 goto out;
 115 
 116         if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_NAME, &nn) != 0)
 117                 goto out;
 118 
 119         if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_TYPE, &tt) != 0)
 120                 goto out;
 121 
 122         *facname = topo_hdl_strdup(hdl, nn);
 123         *factype = topo_hdl_strdup(hdl, tt);
 124         ret = 0;
 125 
 126 out:
 127         nvlist_free(fmri);
 128         return (ret);
 129 }
 130 
/*
 * Argument handed to dl_fault_walk_inner() for each child of a BAY node.
 */
typedef struct dl_fault_walk_inner {
        char *fwi_name;         /* facility name to match, e.g. "fail" */
        uint32_t fwi_mode;      /* LED mode to set on the matching node */
} dl_fault_walk_inner_t;
 135 
 136 static int
 137 dl_fault_walk_inner(topo_hdl_t *thp, tnode_t *node, void *arg)
 138 {
 139         dl_fault_walk_inner_t *fwi = arg;
 140         char *facname = NULL, *factype = NULL;
 141         int err;
 142 
 143         /*
 144          * We're only interested in BAY children that are valid Facility Nodes.
 145          */
 146         if (topo_node_flags(node) != TOPO_NODE_FACILITY ||
 147             get_facility_props(thp, node, &facname, &factype) != 0) {
 148                 goto out;
 149         }
 150 
 151         if (strcmp(fwi->fwi_name, facname) != 0)
 152                 goto out;
 153 
 154         /*
 155          * Attempt to set the LED mode appropriately.  If this fails, give up
 156          * and move on.
 157          */
 158         (void) topo_prop_set_uint32(node, TOPO_PGROUP_FACILITY, TOPO_LED_MODE,
 159             TOPO_PROP_MUTABLE, fwi->fwi_mode, &err);
 160 
 161 out:
 162         topo_hdl_strfree(thp, facname);
 163         topo_hdl_strfree(thp, factype);
 164         return (TOPO_WALK_NEXT);
 165 }
 166 
 167 static int
 168 dl_fault_walk_outer(topo_hdl_t *thp, tnode_t *node, void *arg)
 169 {
 170         disk_lights_t *dl = arg;
 171         dl_fault_walk_inner_t fwi;
 172         tnode_t *pnode;
 173         int err, has_fault;
 174         nvlist_t *fmri = NULL;
 175 
 176         bzero(&fwi, sizeof (fwi));
 177 
 178         /*
 179          * We are only looking for DISK nodes in the topology that have a parent
 180          * BAY.
 181          */
 182         if (strcmp(DISK, topo_node_name(node)) != 0 ||
 183             (pnode = topo_node_parent(node)) == NULL ||
 184             strcmp(BAY, topo_node_name(pnode)) != 0) {
 185                 return (TOPO_WALK_NEXT);
 186         }
 187 
 188         /*
 189          * Check to see if the Resource this FMRI describes is Faulty:
 190          */
 191         if (topo_node_resource(node, &fmri, &err) != 0)
 192                 return (TOPO_WALK_NEXT);
 193         has_fault = fmd_nvl_fmri_has_fault(dl->dl_fmd, fmri,
 194             FMD_HAS_FAULT_RESOURCE, NULL);
 195         nvlist_free(fmri);
 196 
 197         /*
 198          * Walk the children of this BAY and flush out our fault status if
 199          * we find an appropriate indicator node.
 200          */
 201         fwi.fwi_name = "fail";
 202         fwi.fwi_mode = has_fault ? TOPO_LED_STATE_ON : TOPO_LED_STATE_OFF;
 203         (void) topo_node_child_walk(thp, pnode, dl_fault_walk_inner, &fwi,
 204             &err);
 205 
 206         return (TOPO_WALK_NEXT);
 207 }
 208 
 209 /*
 210  * Walk all of the topology nodes looking for DISKs that match the structure
 211  * described in the overview.  Once we find them, check their fault status
 212  * and update their fault indiciator accordingly.
 213  */
 214 static void
 215 dl_examine_topo(disk_lights_t *dl)
 216 {
 217         int err;
 218         topo_hdl_t *thp = NULL;
 219         topo_walk_t *twp = NULL;
 220 
 221         thp = fmd_hdl_topo_hold(dl->dl_fmd, TOPO_VERSION);
 222         if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, dl_fault_walk_outer,
 223             dl, &err)) == NULL) {
 224                 fmd_hdl_error(dl->dl_fmd, "failed to get topology: %s\n",
 225                     topo_strerror(err));
 226                 goto out;
 227         }
 228 
 229         if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) {
 230                 fmd_hdl_error(dl->dl_fmd, "failed to walk topology: %s\n",
 231                     topo_strerror(err));
 232                 goto out;
 233         }
 234 
 235 out:
 236         if (twp != NULL)
 237                 topo_walk_fini(twp);
 238         if (thp != NULL)
 239                 fmd_hdl_topo_rele(dl->dl_fmd, thp);
 240 }
 241 
 242 static void
 243 dl_trigger_enum(disk_lights_t *dl)
 244 {
 245         /*
 246          * If we're already on the short-poll coalesce timer, then return
 247          * immediately.
 248          */
 249         if (dl->dl_triggered == B_TRUE)
 250                 return;
 251         dl->dl_triggered = B_TRUE;
 252 
 253         /*
 254          * Replace existing poll timer with coalesce timer:
 255          */
 256         if (dl->dl_timer != 0)
 257                 fmd_timer_remove(dl->dl_fmd, dl->dl_timer);
 258         dl->dl_timer = fmd_timer_install(dl->dl_fmd, NULL, NULL,
 259             dl->dl_coalesce_interval);
 260 }
 261 
 262 static void
 263 disklights_timeout(fmd_hdl_t *hdl, id_t id, void *data)
 264 {
 265         disk_lights_t *dl = fmd_hdl_getspecific(hdl);
 266 
 267         dl->dl_triggered = B_FALSE;
 268 
 269         dl_examine_topo(dl);
 270 
 271         /*
 272          * Install the long-interval timer for the next poll.
 273          */
 274         dl->dl_timer = fmd_timer_install(hdl, NULL, NULL, dl->dl_poll_interval);
 275 }
 276 
 277 static void
 278 disklights_topo(fmd_hdl_t *hdl, topo_hdl_t *thp)
 279 {
 280         disk_lights_t *dl = fmd_hdl_getspecific(hdl);
 281 
 282         dl_trigger_enum(dl);
 283 }
 284 
 285 static void
 286 disklights_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 287     const char *class)
 288 {
 289         disk_lights_t *dl = fmd_hdl_getspecific(hdl);
 290 
 291         dl_trigger_enum(dl);
 292 }
 293 
 294 void
 295 _fmd_init(fmd_hdl_t *hdl)
 296 {
 297         disk_lights_t *dl;
 298 
 299         if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
 300                 return;
 301 
 302         dl = fmd_hdl_zalloc(hdl, sizeof (*dl), FMD_SLEEP);
 303         fmd_hdl_setspecific(hdl, dl);
 304 
 305         /*
 306          * Load Configuration:
 307          */
 308         dl->dl_fmd = hdl;
 309         dl->dl_poll_interval = fmd_prop_get_int64(hdl, DL_PROP_POLL_INTERVAL);
 310         dl->dl_coalesce_interval = fmd_prop_get_int64(hdl,
 311             DL_PROP_COALESCE_INTERVAL);
 312 
 313         /*
 314          * Schedule the initial enumeration:
 315          */
 316         dl_trigger_enum(dl);
 317 }
 318 
 319 void
 320 _fmd_fini(fmd_hdl_t *hdl)
 321 {
 322         disk_lights_t *dl = fmd_hdl_getspecific(hdl);
 323 
 324         fmd_hdl_free(hdl, dl, sizeof (*dl));
 325 }