#!/usr/bin/ksh
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
# Use is subject to license terms.
#
# NWS DataServices within SunCluster reconfiguration script.
#
# Description:
#
# This script is called from /usr/cluster/lib/sc/run_reserve at
# appropriate times to start and stop the NWS DataServices as SunCluster
# disk device groups are brought online or taken offline.
#
# SNDR configuration requires a resource group to be configured.
# 1. The resource group name should be the same as the device group name
#    with -stor-rg added. e.g. if the device group name is abc-dg then the
#    resource group name would be abc-dg-stor-rg.
# 2. It should have 2 resources in it, unless one of the resource types is
#    the SUNW.GeoCtlAVS. One of type SUNW.LogicalHostname and either
#    SUNW.HAStorage or SUNW.HAStoragePlus types. Resource type versioning is
#    ignored.
#    HAStorage type resource, should have ServicePaths property set to
#    device group name. HAStoragePlus type resource, should have either the
#    FilesystemMountPoints pointing to a file system associated with the
#    device group name, or GlobalDevicePaths property set to device group
#    name.
#    LogicalHostname type resource should have a failoverIP address in it and
#    it will be used by SNDR to communicate with the secondary side.
#
# As SNDR requires that the LogicalHost (failover) IP address which is a
# part of resource group for SNDR, to be hosted on the same node where the
# device group is, it tries to move the resource group also along with the
# device group, in become_primary case of run_reserve script. While
# in primary_to_secondary case, it will try to kill the switchover function
# if it is still running in background, after stopping NWS data services.
#
# Usage:
#
# /usr/cluster/sbin/dscfg_reconfigure { start | stop } diskgroup
#
# Configuration:
#
# Scripts to be run should have been symlinked into $NWS_START_DIR and
# $NWS_STOP_DIR.  Note that the scripts are processed in lexical order,
# and that unlike /etc/rc?.d/ there is no leading S or K character.
#
# Exit status:
#
# 0 - success
# 1 - error
#

#
# Global variables
#

# this program (quoted so that a path containing whitespace still works)
typeset -r ARGV0=$(basename "$0")

# directory full of start scripts
typeset -r NWS_START_DIR=/usr/cluster/lib/dscfg/start

# directory full of stop scripts
typeset -r NWS_STOP_DIR=/usr/cluster/lib/dscfg/stop

# the syslog facility to use.
# - conceptually this should be based on the output of
#   "scha_cluster_get -O SYSLOG_FACILITY", but that won't work early
#   during boot.
typeset -r SYSLOG_FACILITY=daemon

PATH=$PATH:/usr/cluster/bin:/etc

# Variables for retrying scswitch of Resource group for SNDR
retry_num=12
retry_interval=10
rgname=
rgstat=
skip_resource=0
count_LogicalHostname=0
count_HAStoragePlus=0

# Since the switchover of the resource group is called in background,
# the stop action of the reconfig script will kill the background switchover
# if it is running. Since we are stopping the NWS services on the node, there
# is no need to switch the resource group, so it is killed.
# The pid of the process is kept in file /var/run/scnws/$dg.pid.
# Input: dg - device group
# Output: Nothing, kills the process

function kill_scswitch
{
    typeset dg=$1
    typeset pidfile=/var/run/scnws/$dg.pid
    typeset pid

    # No pid file means no background switchover was recorded.
    [ -f "$pidfile" ] || return 0

    # The file may hold several whitespace separated pids.
    for pid in $(cat "$pidfile")
    do
        # Only ever signal a strictly positive integer: "kill -9 0" or a
        # negative value would hit a whole process group (or everything).
        case "$pid" in
        ''|*[!0-9]*)
            continue
            ;;
        esac
        [ "$pid" -gt 0 ] || continue

        kill -9 "$pid"
    done
    rm -f "$pidfile"
}

# Get the status of the resource group on this node, using scha commands.
# Input: resource group - $1
# Output: Status, stored in the global variable rgstat (empty if the
#         scha_resourcegroup_get command prints nothing)

function get_rgstat
{
    typeset rg=$1

    rgstat=$(scha_resourcegroup_get -O RG_STATE -G "$rg")
}

# This function is called in background from do_scswitch function, to
# switch the resource group to this node, which is becoming primary for
# the diskgroup. If the status of resource group is Offline, it will use
# scswitch command to switch the resource group to this node. If it has
# become Online, cleanup pid file. If it is Pending, the resource group
# is in the state of becoming online, so wait for some time to become Online.
# scswitch may fail, so the function retries $retry_num times, waiting for
# $retry_interval seconds.
# Input: resource group - $1, Diskgroup/Diskset - $2
# Output: 0 - success, 1 - failure

function switchfunc
{
    # Poll the state of resource group $1 (through get_rgstat, which fills
    # the global rgstat) and try to bring it online on this host.  At most
    # $retry_num attempts are made; /var/run/scnws/$2.pid is removed on
    # every exit path.
    typeset rg=$1
    typeset dg=$2
    typeset pidfile=/var/run/scnws/$dg.pid
    typeset attempts=0

    # NOTE(review): presumably this delay lets the caller finish (and record
    # our pid) before the first state query - confirm.
    sleep 2

    while [ "$attempts" -lt "$retry_num" ]
    do
        get_rgstat "$rg"
        case "$rgstat" in
        "ONLINE")
            rm -f "$pidfile"
            return 0
            ;;

        "OFFLINE")
            logger -p ${SYSLOG_FACILITY}.notice \
                -t "NWS.[$ARGV0]" "$(gettext "scswitch of resource group")" "$rg"

            # Back off only when scswitch fails; after a successful switch
            # the state is re-read straight away.  Every call counts as an
            # attempt so that a group which keeps reporting OFFLINE cannot
            # keep this loop spinning forever.
            scswitch -z -g "$rg" -h "$(hostname)" || sleep "$retry_interval"
            attempts=$(($attempts + 1))
            ;;

        "PENDING_ONLINE")
            logger -p ${SYSLOG_FACILITY}.notice \
                -t "NWS.[$ARGV0]" "$(gettext "pending online of resource group")" "$rg"
            sleep "$retry_interval"
            attempts=$(($attempts + 1))
            ;;

        *)
            logger -p ${SYSLOG_FACILITY}.notice \
                -t "NWS.[$ARGV0]" "$(gettext "Improper resource group status for Remote Mirror")" "$rgstat"
            rm -f "$pidfile"
            return 1
            ;;
        esac
    done

    # The last permitted attempt may have been the one that worked, so look
    # at the state once more before declaring failure.
    get_rgstat "$rg"
    if [ "$rgstat" = "ONLINE" ]
    then
        rm -f "$pidfile"
        return 0
    fi

    logger -p ${SYSLOG_FACILITY}.err \
        -t "NWS.[$ARGV0]" "Did not switch resource group for Remote Mirror. System Administrator intervention required"
    rm -f "$pidfile"
    return 1
}


# This function calls switchfunc function in background, to switch the
# resource group for SNDR. It validates the diskgroup/diskset is configured
# for SNDR, checks if the resource group is in Managed state etc.
# If it detects a mis-configuration, it will disable SNDR for the
# device group being processed. This is to prevent cluster hangs and panics.
#
# The ServicePaths extension property of HAStorage type resource or the
# GlobalDevicePaths extension property of HAStoragePlus, both of which
# specify the device group, serve as a link or mapping to retrieve the
# resource group associated with the SNDR configured device group.
# Switchfunc is called in the background to avoid the deadlock situation arising
# out of switchover of resource group from within device group switchover.
#
# In run_reserve context, we are doing the device group switchover, trying to
# bring it online on the node. Device group is not completely switched online,
# until the calling script run_reserve returns. In the process, we are calling
# the associated SNDR resource group switchover using scswitch command.
# Resource group switchover will trigger the switchover of device group also.
#
# If resource group switchover is called in foreground, before the device
# group has become online, then it will result in switching the device group
# again, resulting in deadlock. Resource group can not become online until
# the device group is online and the device group can not become online until the
# script returns, causing this circular dependency resulting in deadlock.
#
# Calling the resource group switch in background allows current run_reserve
# script to return immediately, allowing device group to become online.
# If the device group is already online on the node, then the resource group
# does not cause the device group switchover again.
#
# Input: Device group dg - $1
# Output: 0 - success
#         1 - either dg not applicable for SNDR or error
#         2 - SNDR mis-configuration

function do_scswitch
{
    typeset dg=$1
    typeset pidfile=/var/run/scnws/$dg.pid
    typeset rs_list rs rs_type rs_enb

    # Without the scha query commands nothing below can work.
    if [ ! -x /usr/cluster/bin/scha_resource_get ] ||
        [ ! -x /usr/cluster/bin/scha_resourcegroup_get ]
    then
        return 1
    fi

    # hard coded rg name from dg
    rgname="$dg-stor-rg"

    # There is no device group configured in cluster for SNDR with this
    # cluster tag if the resource group cannot be queried.
    scha_resourcegroup_get -O rg_description -G "$rgname" > /dev/null ||
        return 1

    # Check the state of resource group

    get_rgstat "$rgname"
    case "$rgstat" in
    ''|UNMANAGED|ERROR_STOP_FAILED)
        logger -p ${SYSLOG_FACILITY}.notice \
            -t "NWS.[$ARGV0]" \
            "$(gettext "Improper Remote Mirror resource group state")" "$rgstat"
        return 2
        ;;
    esac

    # Check whether resources are of proper type and they are enabled

    rs_list=$(scha_resourcegroup_get -O resource_list -G "$rgname")
    if [ -z "$rs_list" ]
    then
        logger -p ${SYSLOG_FACILITY}.notice \
            -t "NWS.[$ARGV0]" \
            "$(gettext "No resources in Remote Mirror resource group <$rgname>")"
        return 2
    fi

    # Start from zero on every call so that the counts describe this
    # resource group only.
    count_LogicalHostname=0
    count_HAStoragePlus=0

    # Word splitting of $rs_list is intended: one resource name per word.
    for rs in $rs_list
    do
        # The type may carry a ":version" suffix, which is ignored.
        rs_type=$(scha_resource_get -O type -R "$rs" -G "$rgname" |
            cut -d':' -f1)

        # Only enabled SUNW.LogicalHostname and SUNW.HAStoragePlus
        # resources are counted; every other type is skipped.
        case "$rs_type" in
        SUNW.LogicalHostname)
            rs_enb=$(scha_resource_get -O ON_OFF_SWITCH -R "$rs" -G "$rgname")
            if [ "$rs_enb" = "ENABLED" ]
            then
                count_LogicalHostname=$(($count_LogicalHostname + 1))
            fi
            ;;
        SUNW.HAStoragePlus)
            rs_enb=$(scha_resource_get -O ON_OFF_SWITCH -R "$rs" -G "$rgname")
            if [ "$rs_enb" = "ENABLED" ]
            then
                count_HAStoragePlus=$(($count_HAStoragePlus + 1))
            fi
            ;;
        esac
    done

    # Exactly one enabled resource of each kind is required.
    if [ "$count_LogicalHostname" -lt 1 ]
    then
        logger -p ${SYSLOG_FACILITY}.notice \
            -t "NWS.[$ARGV0]" "$(gettext "Missing Enabled Logical Host in resource group <$rgname> for Remote Mirror")"
        return 2
    elif [ "$count_LogicalHostname" -gt 1 ]
    then
        logger -p ${SYSLOG_FACILITY}.notice \
            -t "NWS.[$ARGV0]" "$(gettext "Too Many Enabled Logical Host in resource group <$rgname> for Remote Mirror")"
        return 2
    fi

    if [ "$count_HAStoragePlus" -lt 1 ]
    then
        logger -p ${SYSLOG_FACILITY}.notice \
            -t "NWS.[$ARGV0]" "$(gettext "Missing Enabled HAStoragePlus in resource group <$rgname> for Remote Mirror")"
        return 2
    elif [ "$count_HAStoragePlus" -gt 1 ]
    then
        logger -p ${SYSLOG_FACILITY}.notice \
            -t "NWS.[$ARGV0]" "$(gettext "Too Many Enabled HAStoragePlus in resource group <$rgname> for Remote Mirror")"
        return 2
    fi

    # Invoke switchfunc to switch the resource group.  The pid directory is
    # prepared first so that the pid can be recorded as soon as the
    # background job exists.

    mkdir -p /var/run/scnws/
    rm -f "$pidfile"
    switchfunc "$rgname" "$dg" &
    echo $! > "$pidfile"

    return 0
}


#
# Functions
#

# Log the usage message to syslog and exit with status 1.
usage()
{
    logger -p ${SYSLOG_FACILITY}.err \
        -t "NWS.[$ARGV0]" "usage: $ARGV0 { start | stop } diskgroup"
    exit 1
}


# Run every non-empty script found in a directory, in lexical order, sending
# its output to syslog.
# Input: arg1) directory holding the NWS scripts
#              ($NWS_START_DIR or $NWS_STOP_DIR)
#        arg2) start / stop - passed to each script as its first argument
#        arg3) device group - passed to each script as its second argument
#        arg4) sndr_ena / sndr_dis - with sndr_dis the 10rdc script is skipped
# Output: Nothing. Log error if seen

process_dir()
{
    typeset dir=$1
    typeset arg1=$2
    typeset dg=$3
    typeset arg2=$4
    typeset RDC=$dir/10rdc
    typeset f

    if [[ ! -d $dir ]]
    then
        logger -p ${SYSLOG_FACILITY}.err \
            -t "NWS.[$ARGV0]" "no directory: $dir"
        return
    fi

    # process scripts in the directories in lexical order
    # note - no leading S or K unlike /etc/rc?.d/
    for f in "$dir"/*
    do
        # Skip empty files (and the unexpanded pattern of an empty
        # directory).
        [ -s "$f" ] || continue

        # SNDR misconfigured - prevent start
        if [ "$arg2" = "sndr_dis" ] && [ "$f" = "$RDC" ]
        then
            continue
        fi

        # run script and pipe output through
        # logger into syslog
        /usr/bin/ksh "$f" "$arg1" "$dg" 2>&1 |
            logger -p ${SYSLOG_FACILITY}.notice \
                -t "NWS.[${ARGV0}:$(basename "$f")]"
    done
}


#
# main
#

if [ $# -ne 2 ]
then
    usage
    # not reached
fi


case "$1" in
start)
    logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "starting: $ARGV0 $*"
    do_scswitch "$2"
    retval=$?
    if [ "$retval" -eq 2 ]
    then
        logger -p ${SYSLOG_FACILITY}.err \
            -t "NWS.[$ARGV0]" "**FATAL ERROR** Remote Mirror is mis-configured and DISABLED for devicegroup <$2> "
        # Disable SNDR
        process_dir "$NWS_START_DIR" start "$2" sndr_dis
    else
        process_dir "$NWS_START_DIR" start "$2" sndr_ena
    fi
    ;;
stop)
    logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "stopping: $ARGV0 $*"
    process_dir "$NWS_STOP_DIR" stop "$2" sndr_ena
    # The background resource group switch, if any, is no longer wanted.
    kill_scswitch "$2"
    ;;

*)
    usage
    # not reached
    ;;
esac

logger -p ${SYSLOG_FACILITY}.notice -t "NWS.[$ARGV0]" "completed: $ARGV0 $*"

exit 0