1 /*
   2  * This file and its contents are supplied under the terms of the
   3  * Common Development and Distribution License ("CDDL"), version 1.0.
   4  * You may only use this file in accordance with the terms of version
   5  * 1.0 of the CDDL.
   6  *
   7  * A full copy of the text of the CDDL should have accompanied this
   8  * source.  A copy of the CDDL is also available via the Internet at
   9  * http://www.illumos.org/license/CDDL.
  10  */
  11 
  12 /*
  13  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  14  */
  15 
  16 /*
  17  * Disk Lights Agent (FMA)
  18  *
  19  * This Fault Management Daemon (fmd) module periodically scans the topology
  20  * tree, enumerates all disks with associated fault indicators, and then
  21  * synchronises the fault status of resources in the FMA Resource Cache with
  22  * the indicators.  In short: it turns the fault light on for befallen disks.
  23  *
  24  * Presently, we recognise associated fault indicators for disks by looking
  25  * for the following structure in the topology tree:
  26  *
  27  *    /bay=N
  28  *      |
  29  *      +---- /disk=0   <---------------- our Disk
  30  *      |
  31  *      +---- /bay=N?indicator=fail <---- the Fault Light
  32  *      \---- /bay=N?indicator=ident
  33  *
  34  * That is: a DISK node will have a parent BAY; that BAY will itself have
  35  * child Facility nodes, one of which will be called "fail".  If any of the
  36  * above does not hold, we simply do nothing for this disk.
  37  */
  38 
  39 #include <string.h>
  40 #include <strings.h>
  41 #include <libnvpair.h>
  42 #include <fm/libtopo.h>
  43 #include <fm/topo_list.h>
  44 #include <fm/topo_hc.h>
  45 #include <fm/fmd_api.h>
  46 #include <sys/fm/protocol.h>
  47 
  48 
/*
 * Per-module state.  A single instance is allocated (zeroed) in _fmd_init(),
 * attached to the fmd handle with fmd_hdl_setspecific(), and released in
 * _fmd_fini().
 */
typedef struct disk_lights {
        /* The fmd handle this state belongs to; used for all fmd calls. */
        fmd_hdl_t *dl_fmd;
        /* Value of the "poll-interval" property; delay of the poll timer. */
        uint64_t dl_poll_interval;
        /* Value of the "coalesce-interval" property; delay after a trigger. */
        uint64_t dl_coalesce_interval;
        /* Id from the latest fmd_timer_install(); 0 until one is installed. */
        id_t dl_timer;
        /* B_TRUE from dl_trigger_enum() until disklights_timeout() runs. */
        boolean_t dl_triggered;
} disk_lights_t;
  56 
  57 static void disklights_topo(fmd_hdl_t *, topo_hdl_t *);
  58 static void disklights_recv(fmd_hdl_t *, fmd_event_t *, nvlist_t *,
  59     const char *);
  60 static void disklights_timeout(fmd_hdl_t *, id_t, void *);
  61 
  62 static const fmd_hdl_ops_t fmd_ops = {
  63         disklights_recv,        /* fmdo_recv */
  64         disklights_timeout,     /* fmdo_timeout */
  65         NULL,                   /* fmdo_close */
  66         NULL,                   /* fmdo_stats */
  67         NULL,                   /* fmdo_gc */
  68         NULL,                   /* fmdo_send */
  69         disklights_topo,        /* fmdo_topo */
  70 };
  71 
/*
 * POLL_INTERVAL is the period after which we perform an unsolicited poll
 * to ensure we remain in sync with reality.
 */
#define DL_PROP_POLL_INTERVAL           "poll-interval"

/*
 * COALESCE_INTERVAL is how long we wait after we are triggered by either a
 * topology change or a relevant list.* event, in order to allow a series
 * of events to coalesce.
 */
#define DL_PROP_COALESCE_INTERVAL       "coalesce-interval"

/*
 * Module properties and their default values.  Both are declared as
 * FMD_TYPE_TIME and are read once, via fmd_prop_get_int64(), in _fmd_init().
 * The final all-NULL entry presumably terminates the table for fmd.
 */
static const fmd_prop_t fmd_props[] = {
        { DL_PROP_POLL_INTERVAL, FMD_TYPE_TIME, "5min" },
        { DL_PROP_COALESCE_INTERVAL, FMD_TYPE_TIME, "3s" },
        { NULL, 0, NULL }
};
  90 
/*
 * Module registration record passed to fmd_hdl_register() in _fmd_init().
 */
static const fmd_hdl_info_t fmd_info = {
        "Disk Lights Agent",    /* module description */
        "1.0",                  /* module version */
        &fmd_ops,               /* entry points (see fmd_ops) */
        fmd_props               /* property table (see fmd_props) */
};
  97 
  98 /*
  99  * Fetch the Facility Node properties (name, type) from the FMRI
 100  * for this node, or return -1 if we can't.
 101  */
 102 static int
 103 get_facility_props(tnode_t *node, char **facname, char **factype)
 104 {
 105         int e;
 106         nvlist_t *fmri, *fnvl;
 107 
 108         if (topo_node_resource(node, &fmri, &e) != 0)
 109                 return (-1);
 110 
 111         if (nvlist_lookup_nvlist(fmri, FM_FMRI_FACILITY, &fnvl) != 0)
 112                 return (-1);
 113 
 114         if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_NAME, facname) != 0)
 115                 return (-1);
 116 
 117         if (nvlist_lookup_string(fnvl, FM_FMRI_FACILITY_TYPE, factype) != 0)
 118                 return (-1);
 119 
 120         return (0);
 121 }
 122 
 123 typedef struct dl_fault_walk_inner {
 124         char *fwi_name;
 125         uint32_t fwi_mode;
 126 } dl_fault_walk_inner_t;
 127 
 128 static int
 129 dl_fault_walk_inner(topo_hdl_t *thp, tnode_t *node, void *arg)
 130 {
 131         dl_fault_walk_inner_t *fwi = arg;
 132         char *facname, *factype;
 133         int err;
 134 
 135         /*
 136          * We're only interested in BAY children that are valid Facility Nodes.
 137          */
 138         if (topo_node_flags(node) != TOPO_NODE_FACILITY ||
 139             get_facility_props(node, &facname, &factype) != 0)
 140                 return (TOPO_WALK_NEXT);
 141 
 142         if (strcmp(fwi->fwi_name, facname) != 0)
 143                 return (TOPO_WALK_NEXT);
 144 
 145         /*
 146          * Attempt to set the LED mode appropriately.  If this fails, give up
 147          * and move on.
 148          */
 149         (void) topo_prop_set_uint32(node, TOPO_PGROUP_FACILITY, TOPO_LED_MODE,
 150             TOPO_PROP_MUTABLE, fwi->fwi_mode, &err);
 151 
 152         return (TOPO_WALK_NEXT);
 153 }
 154 
 155 static int
 156 dl_fault_walk_outer(topo_hdl_t *thp, tnode_t *node, void *arg)
 157 {
 158         disk_lights_t *dl = arg;
 159         dl_fault_walk_inner_t fwi;
 160         tnode_t *pnode;
 161         int err, has_fault;
 162         nvlist_t *fmri = NULL;
 163 
 164         bzero(&fwi, sizeof (fwi));
 165 
 166         /*
 167          * We are only looking for DISK nodes in the topology that have a parent
 168          * BAY.
 169          */
 170         if (strcmp(DISK, topo_node_name(node)) != 0 ||
 171             (pnode = topo_node_parent(node)) == NULL ||
 172             strcmp(BAY, topo_node_name(pnode)) != 0)
 173                 return (TOPO_WALK_NEXT);
 174 
 175         /*
 176          * Check to see if the Resource this FMRI describes is Faulty:
 177          */
 178         if (topo_node_resource(node, &fmri, &err) != 0)
 179                 return (TOPO_WALK_NEXT);
 180         has_fault = fmd_nvl_fmri_has_fault(dl->dl_fmd, fmri,
 181             FMD_HAS_FAULT_RESOURCE, NULL);
 182         nvlist_free(fmri);
 183 
 184 
 185         /*
 186          * Walk the children of this BAY and flush out our fault status if
 187          * we find an appropriate indicator node.
 188          */
 189         fwi.fwi_name = "fail";
 190         fwi.fwi_mode = has_fault ? TOPO_LED_STATE_ON : TOPO_LED_STATE_OFF;
 191         (void) topo_node_child_walk(thp, pnode, dl_fault_walk_inner, &fwi);
 192 
 193         return (TOPO_WALK_NEXT);
 194 }
 195 
 196 /*
 197  * Walk all of the topology nodes looking for DISKs that match the structure
 198  * described in the overview.  Once we find them, check their fault status
 199  * and update their fault indiciator accordingly.
 200  */
 201 static void
 202 dl_examine_topo(disk_lights_t *dl)
 203 {
 204         int err;
 205         topo_hdl_t *thp = NULL;
 206         topo_walk_t *twp = NULL;
 207 
 208         thp = fmd_hdl_topo_hold(dl->dl_fmd, TOPO_VERSION);
 209         if ((twp = topo_walk_init(thp, FM_FMRI_SCHEME_HC, dl_fault_walk_outer,
 210             dl, &err)) == NULL) {
 211                 fmd_hdl_error(dl->dl_fmd, "failed to get topology: %s\n",
 212                     topo_strerror(err));
 213                 goto out;
 214         }
 215 
 216         if (topo_walk_step(twp, TOPO_WALK_CHILD) == TOPO_WALK_ERR) {
 217                 fmd_hdl_error(dl->dl_fmd, "failed to walk topology: %s\n",
 218                     topo_strerror(err));
 219                 goto out;
 220         }
 221 
 222 out:
 223         if (twp != NULL)
 224                 topo_walk_fini(twp);
 225         if (thp != NULL)
 226                 fmd_hdl_topo_rele(dl->dl_fmd, thp);
 227 }
 228 
 229 static void
 230 dl_trigger_enum(disk_lights_t *dl)
 231 {
 232         /*
 233          * If we're already on the short-poll coalesce timer, then return
 234          * immediately.
 235          */
 236         if (dl->dl_triggered == B_TRUE)
 237                 return;
 238         dl->dl_triggered = B_TRUE;
 239 
 240         /*
 241          * Replace existing poll timer with coalesce timer:
 242          */
 243         if (dl->dl_timer != 0)
 244                 fmd_timer_remove(dl->dl_fmd, dl->dl_timer);
 245         dl->dl_timer = fmd_timer_install(dl->dl_fmd, NULL, NULL,
 246             dl->dl_coalesce_interval);
 247 }
 248 
 249 static void
 250 disklights_timeout(fmd_hdl_t *hdl, id_t id, void *data)
 251 {
 252         disk_lights_t *dl = fmd_hdl_getspecific(hdl);
 253 
 254         dl->dl_triggered = B_FALSE;
 255 
 256         dl_examine_topo(dl);
 257 
 258         /*
 259          * Install the long-interval timer for the next poll.
 260          */
 261         dl->dl_timer = fmd_timer_install(hdl, NULL, NULL, dl->dl_poll_interval);
 262 }
 263 
 264 static void
 265 disklights_topo(fmd_hdl_t *hdl, topo_hdl_t *thp)
 266 {
 267         disk_lights_t *dl = fmd_hdl_getspecific(hdl);
 268 
 269         dl_trigger_enum(dl);
 270 }
 271 
 272 static void
 273 disklights_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 274     const char *class)
 275 {
 276         disk_lights_t *dl = fmd_hdl_getspecific(hdl);
 277 
 278         dl_trigger_enum(dl);
 279 }
 280 
 281 void
 282 _fmd_init(fmd_hdl_t *hdl)
 283 {
 284         disk_lights_t *dl;
 285 
 286         if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
 287                 return;
 288 
 289         dl = fmd_hdl_zalloc(hdl, sizeof (*dl), FMD_SLEEP);
 290         fmd_hdl_setspecific(hdl, dl);
 291 
 292         /*
 293          * Load Configuration:
 294          */
 295         dl->dl_fmd = hdl;
 296         dl->dl_poll_interval = fmd_prop_get_int64(hdl, DL_PROP_POLL_INTERVAL);
 297         dl->dl_coalesce_interval = fmd_prop_get_int64(hdl,
 298             DL_PROP_COALESCE_INTERVAL);
 299 
 300         /*
 301          * Schedule the initial enumeration:
 302          */
 303         dl_trigger_enum(dl);
 304 }
 305 
 306 void
 307 _fmd_fini(fmd_hdl_t *hdl)
 308 {
 309         disk_lights_t *dl = fmd_hdl_getspecific(hdl);
 310 
 311         fmd_hdl_free(hdl, dl, sizeof (*dl));
 312 }