1 #!/usr/bin/ksh
   3 #
   4 # The contents of this file are subject to the terms of the
   5 # Common Development and Distribution License (the "License").
   6 # You may not use this file except in compliance with the License.
   7 #
   8 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9 # or http://www.opensolaris.org/os/licensing.
  10 # See the License for the specific language governing permissions
  11 # and limitations under the License.
  12 #
  13 # When distributing Covered Code, include this CDDL HEADER in each
  14 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15 # If applicable, add the following below this CDDL HEADER, with the
  16 # fields enclosed by brackets "[]" replaced with your own identifying
  17 # information: Portions Copyright [yyyy] [name of copyright owner]
  18 #
  20 #
  21 #
  22 # Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23 # Use is subject to license terms.
  24 #
  25 # NWS DataServices within SunCluster reconfiguration script.
  26 #
  27 # Description:
  28 #
  29 # This script is called from /usr/cluster/lib/sc/run_reserve at
  30 # appropriate times to start and stop the NWS DataServices as SunCluster
  31 # disk device groups are brought online or taken offline.
  32 #
# SNDR configuration requires a resource group to be configured.
  34 # 1. The resource group name should be same as device group name with -stor-rg
  35 #    added. e.g. if device group name is abc-dg then resource group name
  36 #    would be abc-dg-stor-rg. 
  37 # 2. It should have 2 resources in it, unless one of the resource types is the
  38 #    SUNW.GeoCtlAVS. One of type SUNW.LogicalHostname and either SUNW.HAStorage
  39 #    or SUNW.HAStoragePlus types. Resource type versioning is ignored.
#    A HAStorage type resource should have its ServicePaths property set to
#    the device group name. A HAStoragePlus type resource should have either
#    FilesystemMountPoints pointing to a file system associated with the
#    device group name, or its GlobalDevicePaths property set to the device
#    group name.
  44 #    LogicalHostname type resource should have a failoverIP address in it and
  45 #    it will be used by SNDR to communicate with the secondary side.
  46 #
# SNDR requires that the LogicalHost (failover) IP address, which is part of
# the resource group for SNDR, be hosted on the same node as the device
# group. Therefore, in the become_primary case of the run_reserve script,
# this script tries to move the resource group along with the device group.
# In the primary_to_secondary case, after stopping the NWS data services, it
# tries to kill the switchover function if that is still running in the
# background.
  53 # 
  54 # Usage:
  55 #
  56 # /usr/cluster/sbin/dscfg_reconfigure { start | stop } diskgroup
  57 #
  58 # Configuration:
  59 #
  60 # Scripts to be run should have been symlinked into $NWS_START_DIR and
  61 # $NWS_STOP_DIR.  Note that the scripts are processed in lexical order,
  62 # and that unlike /etc/rc?.d/ there is no leading S or K character.
  63 #
  64 # Exit status:
  65 #
  66 # 0 - success
  67 # 1 - error
  68 #
  70 #
  71 # Global variables
  72 #
  74 # this program
  75 typeset -r ARGV0=$(basename $0)
  77 # directory full of start scripts
  78 typeset -r NWS_START_DIR=/usr/cluster/lib/dscfg/start
  80 # directory full of stop scripts
  81 typeset -r NWS_STOP_DIR=/usr/cluster/lib/dscfg/stop
  83 # the syslog facility to use.
  84 # - conceptually this should be based on the output of
  85 #   "scha_cluster_get -O SYSLOG_FACILITY", but that won't work early
  86 #   during boot.
  87 typeset -r SYSLOG_FACILITY=daemon
  89 PATH=$PATH:/usr/cluster/bin:/etc
  91 # Variables for retrying scswitch of Resource group for SNDR
  92 retry_num=12
  93 retry_interval=10
  94 rgname=
  95 rgstat=
  96 skip_resource=0
  97 count_LogicalHostname=0
  98 count_HAStoragePlus=0
 100 # Since the switchover of the resource group is called in background,    
 101 # the stop action of the reconfig script will kill the background switchover
 102 # if it is running. Since we are stopping the NWS services on the node, there
 103 # is no need to switch the resource group, so  it is killed.
 104 # The pid of the process is kept in file /var/run/scnws/$dg.pid.
 105 # Input:  dg - device group
 106 # Output: Nothing, kills the process
 108 function kill_scswitch
 109 {
 110         dg=$1
 111         if [ -f /var/run/scnws/$dg.pid ]
 112         then
 113                 for i in `cat /var/run/scnws/$dg.pid`
 114                 do
 115                         pid=$i
 116                         kill -9 $pid
 117                 done
 118                 rm -f /var/run/scnws/$dg.pid
 119         fi
 120 }
 122 # Get the status of the resource group on this node, using scha commands.
 123 # Input: resource group - $1
 124 # Output: Status
 126 function get_rgstat
 127 {
 128         rg=$1
 129         rgstat=`scha_resourcegroup_get -O RG_STATE -G $rg`
 130 }
 132 # This function is called in background from do_scswitch function, to
 133 # switch the resource group to this node, which is becoming primary for
 134 # the diskgroup. If the status of resource group is Offline, it will use
 135 # scswitch command to switch the resource group to this node. If it has
 136 # become Online, cleanup pid file. If it is Pending, the resource group
 137 # is in the state of becoming online, so wait for sometime to become Online..
 138 # scswitch may fail, so the function retries $retry_num times, waiting for
 139 # $retry_interval seconds.
 140 # Input: resource group - $1, Diskgroup/Diskset - $2
 141 # Output: 0 - success, 1 - failure
 143 function switchfunc
 144 {
 145         rg=$1
 146         dg=$2
 147         how_many=0
 148         sleep 2
 149         while [ $how_many != $retry_num ]
 150         do
 151                 get_rgstat $rg
 152                 case "$rgstat" in
 153                 "ONLINE")
 154                         rm -f /var/run/scnws/$dg.pid
 155                         return 0
 156                         ;;
 158                 "OFFLINE")
 159                         logger -p ${SYSLOG_FACILITY}.notice \
 160                         -t "NWS.[$ARGV0]" `gettext "scswitch of resource group"` "$rg"
 162                         scswitch -z -g $rg -h $(hostname)
 163                         retval=$?
 164                         if [ $retval != 0 ]
 165                         then
 166                                 sleep $retry_interval
 167                                 how_many=$(($how_many + 1))
 168                         fi
 169                         ;;
 171                 "PENDING_ONLINE")
 172                         logger -p ${SYSLOG_FACILITY}.notice \
 173                         -t "NWS.[$ARGV0]" `gettext "pending online of resource group"` "$rg"
 174                         sleep $retry_interval
 175                         how_many=$(($how_many + 1))
 176                         ;;
 178                 *)
 179                         logger -p ${SYSLOG_FACILITY}.notice \
 180                         -t "NWS.[$ARGV0]" `gettext "Improper resource group status for Remote Mirror"` "$rgstat"
 181                         rm -f /var/run/scnws/$dg.pid
 182                         return 1
 183                         ;;      
 184                 esac
 185         done
 186         logger -p ${SYSLOG_FACILITY}.err \
 187         -t "NWS.[$ARGV0]" "Did not switch resource group for Remote Mirror. System Administrator intervention required"
 188         rm -f /var/run/scnws/$dg.pid
 189         return 1
 190 }
 193 # This function calls switchfunc function in background, to switch the 
 194 # resource group for SNDR. It validates the diskgroup/diskset is configured 
 195 # for SNDR, checks if the resource group is in Managed state etc.
 196 # If it detects a mis-configuration, it will disable SNDR for the
 197 # device group being processed. This is to prevent cluster hangs and panics.
 198 #  
 199 # The ServicePaths extension property of HAStorage type resource or the
 200 # GlobalDevicePaths extension property of HAStoragePlus, both of which
 201 # specify the device group, serve as a link or mapping to retrieve the 
 202 # resource group associated with the SNDR configured device group.
 203 # Switchfunc is called in the background to avoid the deadlock situation arising
 204 # out of switchover of resource group from within device group switchover.
 205 #
 206 # In run_reserve context, we are doing the device group switchover, trying to
 207 # bring it online on the node. Device group is not completely switched online,
 208 # until the calling script run_reserve returns. In the process, we are calling
 209 # the associated SNDR resource group switchover using scswitch command. 
 210 # Resource group switchover will trigger the switchover of device group also. 
 211 #
 212 # If resource group switchover is called in foreground, before the device 
 213 # group has become online, then it will result in switching the device group 
 214 # again, resulting in deadlock. Resource group can not become online until 
 215 # the device group is online and the device group can not become online until the 
 216 # script returns, causing this circular dependency resulting in deadlock. 
 217 #
 218 # Calling the resource group switch in background allows current run_reserve
 219 # script to return immediately, allowing device group to become online.
 220 # If the device group is already online on the node, then the resource group 
 221 # does not cause the device group switchover again.
 222 #
 223 # Input: Device group dg - $1
 224 # Output: 0 - success
 225 #         1 - either dg not applicable for SNDR or error
 226 #         2 - SNDR mis-configuration
 228 function do_scswitch
 229 {
 230         dg=$1
 232         if [ ! -x /usr/cluster/bin/scha_resource_get \
 233                 -o ! -x /usr/cluster/bin/scha_resourcegroup_get ]
 234         then
 235                 return 1
 236         fi
 238 # hard coded rg name from dg
 239         rgname="$dg-stor-rg"
 240         scha_resourcegroup_get -O rg_description -G $rgname > /dev/null
 241         if [ $? != 0 ]
 242         then
 243 # There is no device group configured in cluster for SNDR with this cluster tag
 244                 return 1
 245         fi
 247 # Check the state of resource group
 249         get_rgstat $rgname
 250         if [ -z "$rgstat" \
 251                 -o "$rgstat" = "UNMANAGED" -o "$rgstat" = "ERROR_STOP_FAILED" ]
 252         then
 253                 logger -p ${SYSLOG_FACILITY}.notice \
 254                 -t "NWS.[$ARGV0]" \
 255                 `gettext "Improper Remote Mirror resource group state"` "$rgstat"
 256                 return 2 
 257         fi
 259 # Check whether resources are of proper type and they are enabled
 261         rs_list=`scha_resourcegroup_get -O resource_list -G $rgname`
 262         if [ -z "$rs_list" ]
 263         then
 264                 logger -p ${SYSLOG_FACILITY}.notice \
 265                 -t "NWS.[$ARGV0]" \
 266                 `gettext "No resources in Remote Mirror resource group <$rgname>"`
 267                 return 2 
 268         fi
 269         for rs in $rs_list
 270         do
 271                 rs_type=`scha_resource_get -O type -R $rs -G $rgname  | cut -d':' -f1`
 272                 case "$rs_type" in
 273                 SUNW.LogicalHostname)
 274                         rs_enb=`scha_resource_get -O ON_OFF_SWITCH -R $rs -G $rgname`
 275                         if [ "$rs_enb" = "ENABLED" ]
 276                         then
 277                         count_LogicalHostname=$(($count_LogicalHostname + 1))
 278                         fi
 279                         ;;
 280                 SUNW.HAStoragePlus)
 281                         rs_enb=`scha_resource_get -O ON_OFF_SWITCH -R $rs -G $rgname`
 282                         if [ "$rs_enb" = "ENABLED" ]
 283                         then
 284                         count_HAStoragePlus=$(($count_HAStoragePlus + 1))
 285                         fi
 286                         ;;
 287                 esac
 288         done
 289         if [ $count_LogicalHostname -lt 1 ]
 290         then
 291                 logger -p ${SYSLOG_FACILITY}.notice \
 292                 -t "NWS.[$ARGV0]" `gettext "Missing Enabled Logical Host in resource group <$rgname> for Remote Mirror"`
 293                 return 2
 294         elif [ $count_LogicalHostname -gt 1 ]
 295         then
 296                 logger -p ${SYSLOG_FACILITY}.notice \
 297                 -t "NWS.[$ARGV0]" `gettext "Too Many Enabled Logical Host in resource group <$rgname> for Remote Mirror"`
 298                 return 2
 299         fi
 301         if [ $count_HAStoragePlus -lt 1 ]
 302         then
 303                 logger -p ${SYSLOG_FACILITY}.notice \
 304                 -t "NWS.[$ARGV0]" `gettext "Missing Enabled HAStoragePlus in resource group <$rgname> for Remote Mirror"`
 305                 return 2
 306         elif [ $count_HAStoragePlus -gt 1 ]
 307         then
 308                 logger -p ${SYSLOG_FACILITY}.notice \
 309                 -t "NWS.[$ARGV0]" `gettext "Too Many Enabled HAStoragePlus in resource group <$rgname> for Remote Mirror"`
 310                 return 2
 311         fi
 313 # Invoke switchfunc to switch the resource group. 
 315         switchfunc $rgname $dg &
 316         pid=$!
 317         mkdir -p /var/run/scnws/
 318         rm -f /var/run/scnws/$dg.pid
 319         echo $pid > /var/run/scnws/$dg.pid
 321         return 0
 322 }
 325 #
 326 # Functions
 327 #
 329 usage()
 330 {
 331         logger -p ${SYSLOG_FACILITY}.err \
 332             -t "NWS.[$ARGV0]" "usage: $ARGV0 { start | stop } diskgroup"
 333         exit 1
 334 }
 337 # Input: arg1) $NWS_START_DIR - location of NWS scripts
 338 #        arg2) start / stop
 339 #        arg3 ) device group - $2
 340 #        arg4) sndr_ena / sndr_dis
 341 # Output: Nothing. Log error if seen
 343 process_dir()
 344 {
 345         typeset dir=$1
 346         typeset arg1=$2
 347         typeset dg=$3
 348         typeset arg2=$4
 349         typeset RDC=$dir/10rdc
 351         if [[ -d $dir ]]
 352         then
 353                 for f in $dir/*
 354                 do
 355                         # process scripts in the directories in lexical order
 356                         # note - no leading S or K unlike /etc/rc?.d/
 358                         if [ -s $f ] && [ $arg2 != "sndr_dis" ]   
 359                         then
 360                                 # run script and pipe output through
 361                                 # logger into syslog
 363                                 /usr/bin/ksh $f $arg1 $dg 2>&1 |
 364                                     logger -p ${SYSLOG_FACILITY}.notice \
 365                                         -t "NWS.[${ARGV0}:$(basename $f)]"
 366                         else
 367                         # SNDR misconfigured - prevent start
 368                             if [ -s $f ] && [ $f != $RDC ] 
 369                             then
 370                                 # run script and pipe output through
 371                                 # logger into syslog
 372                                 /usr/bin/ksh $f $arg1 $dg 2>&1 |
 373                                     logger -p ${SYSLOG_FACILITY}.notice \
 374                                         -t "NWS.[${ARGV0}:$(basename $f)]"
 375                             fi
 376                         fi
 377                 done
 378         else
 379                 logger -p ${SYSLOG_FACILITY}.err \
 380                     -t "NWS.[$ARGV0]" "no directory: $dir"
 381         fi
 382 }
 385 #
 386 # main
 387 #
 389 if [ $# -ne 2 ]
 390 then
 391         usage
 392         # not reached
 393 fi
 396 case "$1" in
 397 start)
 398         logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "starting: $ARGV0 $*"
 399         do_scswitch $2
 400         retval=$?
 401         if [ $retval == 2 ]
 402         then
 403                 logger -p ${SYSLOG_FACILITY}.err \
 404                     -t "NWS.[$ARGV0]" "**FATAL ERROR** Remote Mirror is mis-configured and DISABLED for devicegroup <"$2"> " 
 405                 # Disable SNDR 
 406                 process_dir $NWS_START_DIR start "$2" sndr_dis
 407         else
 408                 process_dir $NWS_START_DIR start "$2" sndr_ena
 409         fi
 410         ;;
 411 stop)
 412         logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "stopping: $ARGV0 $*"
 413         process_dir $NWS_STOP_DIR stop "$2" sndr_ena
 414         kill_scswitch $2
 415         ;;
 417 *)
 418         usage
 419         # not reached
 420         ;;
 421 esac
 423 logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "completed: $ARGV0 $*"
 425 exit 0