1 #!/usr/bin/ksh
2 # CDDL HEADER START
3 #
4 # The contents of this file are subject to the terms of the
5 # Common Development and Distribution License (the "License").
6 # You may not use this file except in compliance with the License.
7 #
8 # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 # or http://www.opensolaris.org/os/licensing.
10 # See the License for the specific language governing permissions
11 # and limitations under the License.
12 #
13 # When distributing Covered Code, include this CDDL HEADER in each
14 # file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 # If applicable, add the following below this CDDL HEADER, with the
16 # fields enclosed by brackets "[]" replaced with your own identifying
17 # information: Portions Copyright [yyyy] [name of copyright owner]
18 #
19 # CDDL HEADER END
20 #
21 #
22 # Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 # Use is subject to license terms.
24 #
25 # NWS DataServices within SunCluster reconfiguration script.
26 #
27 # Description:
28 #
29 # This script is called from /usr/cluster/lib/sc/run_reserve at
30 # appropriate times to start and stop the NWS DataServices as SunCluster
31 # disk device groups are brought online or taken offline.
32 #
# SNDR configuration requires that a resource group be configured.
34 # 1. The resource group name should be same as device group name with -stor-rg
35 # added. e.g. if device group name is abc-dg then resource group name
36 # would be abc-dg-stor-rg.
37 # 2. It should have 2 resources in it, unless one of the resource types is the
38 # SUNW.GeoCtlAVS. One of type SUNW.LogicalHostname and either SUNW.HAStorage
39 # or SUNW.HAStoragePlus types. Resource type versioning is ignored.
40 # HAStorage type resource, should have ServicePaths property set to
41 # device group name. HAStoragePlus type resource, should have either the
# FilesystemMountPoints pointing to a file system associated with the
43 # device group name, or GlobalDevicePaths property set to device group name.
44 # LogicalHostname type resource should have a failoverIP address in it and
45 # it will be used by SNDR to communicate with the secondary side.
46 #
# As SNDR requires that the LogicalHost (failover) IP address, which is
# part of the resource group for SNDR, be hosted on the same node as the
# device group, this script also tries to move the resource group along with
# the device group in the become_primary case of the run_reserve script. In
# the primary_to_secondary case, after stopping the NWS data services, it
# tries to kill the switchover function if it is still running in background.
53 #
54 # Usage:
55 #
56 # /usr/cluster/sbin/dscfg_reconfigure { start | stop } diskgroup
57 #
58 # Configuration:
59 #
60 # Scripts to be run should have been symlinked into $NWS_START_DIR and
61 # $NWS_STOP_DIR. Note that the scripts are processed in lexical order,
62 # and that unlike /etc/rc?.d/ there is no leading S or K character.
63 #
64 # Exit status:
65 #
66 # 0 - success
67 # 1 - error
68 #
69
70 #
71 # Global variables
72 #
73
# Name of this program, used to tag every syslog message.
# The parameter expansion strips the directory part of $0 without forking
# basename and stays intact when the script path contains whitespace.
typeset -r ARGV0="${0##*/}"

# directory full of start scripts
typeset -r NWS_START_DIR=/usr/cluster/lib/dscfg/start

# directory full of stop scripts
typeset -r NWS_STOP_DIR=/usr/cluster/lib/dscfg/stop

# the syslog facility to use.
# - conceptually this should be based on the output of
#   "scha_cluster_get -O SYSLOG_FACILITY", but that won't work early
#   during boot.
typeset -r SYSLOG_FACILITY=daemon

# Append the cluster command directory and /etc to the search path.
PATH=$PATH:/usr/cluster/bin:/etc

# Variables for retrying scswitch of Resource group for SNDR
retry_num=12		# number of attempts made by switchfunc
retry_interval=10	# seconds slept between attempts
rgname=			# resource group name, set by do_scswitch
rgstat=			# resource group state, set by get_rgstat
skip_resource=0
count_LogicalHostname=0	# enabled SUNW.LogicalHostname resources found
count_HAStoragePlus=0	# enabled SUNW.HAStoragePlus resources found
99
100 # Since the switchover of the resource group is called in background,
101 # the stop action of the reconfig script will kill the background switchover
102 # if it is running. Since we are stopping the NWS services on the node, there
103 # is no need to switch the resource group, so it is killed.
104 # The pid of the process is kept in file /var/run/scnws/$dg.pid.
105 # Input: dg - device group
106 # Output: Nothing, kills the process
107
function kill_scswitch
{
	# Kill the background resource group switchover recorded for a
	# device group and remove its pid file.
	# Input:  $1 - device group whose pid file is
	#         /var/run/scnws/<device group>.pid
	# Output: nothing; returns 0 when no pid file exists.
	typeset dg=$1
	typeset pidfile="/var/run/scnws/$dg.pid"
	typeset pid

	[ -f "$pidfile" ] || return 0

	# Word splitting of the file content is intentional: every
	# whitespace separated word is treated as one process id.
	for pid in $(cat "$pidfile")
	do
		case "$pid" in
		''|*[!0-9]*)
			# not a process id - never hand garbage to kill
			continue
			;;
		esac

		# pid 0 would signal our own process group and pid 1 is
		# init; neither can be a background switchover.
		[ "$pid" -gt 1 ] || continue

		kill -9 "$pid"
	done
	rm -f "$pidfile"
}
121
122 # Get the status of the resource group on this node, using scha commands.
123 # Input: resource group - $1
124 # Output: Status
125
function get_rgstat
{
	# Store the RG_STATE reported by scha_resourcegroup_get for a
	# resource group in the global variable rgstat.
	# Input:  $1 - resource group name
	# Output: global rgstat (empty when the query prints nothing);
	#         the return status is that of scha_resourcegroup_get.
	typeset rg=$1

	rgstat=$(scha_resourcegroup_get -O RG_STATE -G "$rg")
}
131
# This function is called in background from do_scswitch function, to
# switch the resource group to this node, which is becoming primary for
# the diskgroup. If the status of resource group is Offline, it will use
# scswitch command to switch the resource group to this node. If it has
# become Online, cleanup pid file. If it is Pending, the resource group
# is in the state of becoming online, so wait for some time for it to
# become Online. scswitch may fail, so the function retries up to
# $retry_num times, waiting $retry_interval seconds between attempts.
140 # Input: resource group - $1, Diskgroup/Diskset - $2
141 # Output: 0 - success, 1 - failure
142
function switchfunc
{
	# Bring a resource group online on this node, retrying as needed.
	# Input:  $1 - resource group, $2 - diskgroup/diskset
	# Output: 0 - resource group is ONLINE
	#         1 - unexpected state, or still not ONLINE after
	#             $retry_num attempts
	# Reads the globals retry_num, retry_interval, SYSLOG_FACILITY and
	# ARGV0, and the global rgstat that get_rgstat fills in.  The pid
	# file of the diskgroup is removed on every exit path.
	typeset rg=$1
	typeset dg=$2
	typeset pidfile="/var/run/scnws/$dg.pid"
	typeset attempts=0

	# NOTE(review): presumably lets the caller finish recording our pid
	# and return before the first state query - confirm.
	sleep 2

	while [ "$attempts" -lt "$retry_num" ]
	do
		get_rgstat "$rg"
		case "$rgstat" in
		"ONLINE")
			rm -f "$pidfile"
			return 0
			;;

		"OFFLINE")
			logger -p "${SYSLOG_FACILITY}.notice" \
			    -t "NWS.[$ARGV0]" \
			    "$(gettext "scswitch of resource group")" "$rg"

			# Count every attempt, successful or not, so that a
			# resource group which stays OFFLINE although
			# scswitch reports success cannot make this loop
			# spin forever.  Only a failed attempt is followed
			# by a pause.
			attempts=$(($attempts + 1))
			scswitch -z -g "$rg" -h "$(hostname)" ||
			    sleep "$retry_interval"
			;;

		"PENDING_ONLINE")
			logger -p "${SYSLOG_FACILITY}.notice" \
			    -t "NWS.[$ARGV0]" \
			    "$(gettext "pending online of resource group")" "$rg"
			sleep "$retry_interval"
			attempts=$(($attempts + 1))
			;;

		*)
			# any other state is not handled here: give up
			logger -p "${SYSLOG_FACILITY}.notice" \
			    -t "NWS.[$ARGV0]" \
			    "$(gettext "Improper resource group status for Remote Mirror")" "$rgstat"
			rm -f "$pidfile"
			return 1
			;;
		esac
	done

	# all attempts used up without reaching ONLINE
	logger -p "${SYSLOG_FACILITY}.err" \
	    -t "NWS.[$ARGV0]" "Did not switch resource group for Remote Mirror. System Administrator intervention required"
	rm -f "$pidfile"
	return 1
}
191
192
193 # This function calls switchfunc function in background, to switch the
194 # resource group for SNDR. It validates the diskgroup/diskset is configured
195 # for SNDR, checks if the resource group is in Managed state etc.
196 # If it detects a mis-configuration, it will disable SNDR for the
197 # device group being processed. This is to prevent cluster hangs and panics.
198 #
199 # The ServicePaths extension property of HAStorage type resource or the
200 # GlobalDevicePaths extension property of HAStoragePlus, both of which
201 # specify the device group, serve as a link or mapping to retrieve the
202 # resource group associated with the SNDR configured device group.
203 # Switchfunc is called in the background to avoid the deadlock situation arising
204 # out of switchover of resource group from within device group switchover.
205 #
206 # In run_reserve context, we are doing the device group switchover, trying to
207 # bring it online on the node. Device group is not completely switched online,
208 # until the calling script run_reserve returns. In the process, we are calling
209 # the associated SNDR resource group switchover using scswitch command.
210 # Resource group switchover will trigger the switchover of device group also.
211 #
212 # If resource group switchover is called in foreground, before the device
213 # group has become online, then it will result in switching the device group
214 # again, resulting in deadlock. Resource group can not become online until
215 # the device group is online and the device group can not become online until the
216 # script returns, causing this circular dependency resulting in deadlock.
217 #
218 # Calling the resource group switch in background allows current run_reserve
219 # script to return immediately, allowing device group to become online.
220 # If the device group is already online on the node, then the resource group
221 # does not cause the device group switchover again.
222 #
223 # Input: Device group dg - $1
224 # Output: 0 - success
225 # 1 - either dg not applicable for SNDR or error
226 # 2 - SNDR mis-configuration
227
function do_scswitch
{
	# Validate the SNDR resource group that belongs to a device group
	# and start switchfunc in the background to bring it online here.
	# Input:  $1 - device group
	# Output: 0 - switchfunc started, its pid stored in
	#             /var/run/scnws/<device group>.pid
	#         1 - scha tools missing, or no resource group named
	#             <device group>-stor-rg exists
	#         2 - SNDR mis-configuration (logged through syslog)
	# Sets the globals rgname, count_LogicalHostname and
	# count_HAStoragePlus; rgstat is set through get_rgstat.
	typeset dg=$1
	typeset rs_list rs rs_type rs_enb pid

	# Nothing can be checked without the scha query commands.
	if [ ! -x /usr/cluster/bin/scha_resource_get ] ||
	    [ ! -x /usr/cluster/bin/scha_resourcegroup_get ]
	then
		return 1
	fi

	# hard coded rg name from dg
	rgname="$dg-stor-rg"

	# There is no device group configured in cluster for SNDR with
	# this cluster tag when the resource group cannot be queried.
	scha_resourcegroup_get -O rg_description -G "$rgname" > /dev/null ||
	    return 1

	# Check the state of resource group

	get_rgstat "$rgname"
	case "$rgstat" in
	""|"UNMANAGED"|"ERROR_STOP_FAILED")
		logger -p "${SYSLOG_FACILITY}.notice" \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "Improper Remote Mirror resource group state")" "$rgstat"
		return 2
		;;
	esac

	# Check whether resources are of proper type and they are enabled

	rs_list=$(scha_resourcegroup_get -O resource_list -G "$rgname")
	if [ -z "$rs_list" ]
	then
		logger -p "${SYSLOG_FACILITY}.notice" \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "No resources in Remote Mirror resource group <$rgname>")"
		return 2
	fi

	# Start counting from zero on every call so that a repeated
	# invocation does not add to the totals of an earlier one.
	count_LogicalHostname=0
	count_HAStoragePlus=0

	# $rs_list is split on whitespace on purpose: one word per resource.
	for rs in $rs_list
	do
		# drop everything after the first ':' (the type version)
		rs_type=$(scha_resource_get -O type -R "$rs" -G "$rgname" |
		    cut -d':' -f1)
		case "$rs_type" in
		SUNW.LogicalHostname)
			rs_enb=$(scha_resource_get -O ON_OFF_SWITCH \
			    -R "$rs" -G "$rgname")
			if [ "$rs_enb" = "ENABLED" ]
			then
				count_LogicalHostname=$(($count_LogicalHostname + 1))
			fi
			;;
		SUNW.HAStoragePlus)
			rs_enb=$(scha_resource_get -O ON_OFF_SWITCH \
			    -R "$rs" -G "$rgname")
			if [ "$rs_enb" = "ENABLED" ]
			then
				count_HAStoragePlus=$(($count_HAStoragePlus + 1))
			fi
			;;
		esac
	done

	# Exactly one enabled resource of each type is required.
	if [ "$count_LogicalHostname" -lt 1 ]
	then
		logger -p "${SYSLOG_FACILITY}.notice" \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "Missing Enabled Logical Host in resource group <$rgname> for Remote Mirror")"
		return 2
	elif [ "$count_LogicalHostname" -gt 1 ]
	then
		logger -p "${SYSLOG_FACILITY}.notice" \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "Too Many Enabled Logical Host in resource group <$rgname> for Remote Mirror")"
		return 2
	fi

	if [ "$count_HAStoragePlus" -lt 1 ]
	then
		logger -p "${SYSLOG_FACILITY}.notice" \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "Missing Enabled HAStoragePlus in resource group <$rgname> for Remote Mirror")"
		return 2
	elif [ "$count_HAStoragePlus" -gt 1 ]
	then
		logger -p "${SYSLOG_FACILITY}.notice" \
		    -t "NWS.[$ARGV0]" \
		    "$(gettext "Too Many Enabled HAStoragePlus in resource group <$rgname> for Remote Mirror")"
		return 2
	fi

	# Invoke switchfunc to switch the resource group.  It runs in the
	# background; its pid is recorded so that kill_scswitch can stop it.

	switchfunc "$rgname" "$dg" &
	pid=$!
	mkdir -p /var/run/scnws/
	rm -f "/var/run/scnws/$dg.pid"
	echo "$pid" > "/var/run/scnws/$dg.pid"

	return 0
}
323
324
325 #
326 # Functions
327 #
328
usage()
{
	# Report the expected command line through syslog at error
	# priority, then end the script with exit status 1.
	# This function does not return to its caller.
	logger -p "${SYSLOG_FACILITY}.err" -t "NWS.[$ARGV0]" \
	    "usage: $ARGV0 { start | stop } diskgroup"
	exit 1
}
335
336
# Input: arg1) directory of NWS scripts ($NWS_START_DIR or $NWS_STOP_DIR)
# arg2) start / stop
# arg3) device group
# arg4) sndr_ena / sndr_dis
# Output: Nothing. Log error if seen
342
process_dir()
{
	# Run every non-empty script found in a directory, sending the
	# output of each script to syslog.
	# Input:  $1 - directory holding the scripts
	#         $2 - action handed to each script (start / stop)
	#         $3 - device group handed to each script
	#         $4 - sndr_ena to run everything, sndr_dis to leave out
	#              the 10rdc script of that directory
	# Output: nothing; a missing directory is logged as an error.
	typeset dir="$1"
	typeset arg1="$2"
	typeset dg="$3"
	typeset arg2="$4"
	typeset RDC="$dir/10rdc"
	typeset f

	if [[ ! -d $dir ]]
	then
		logger -p "${SYSLOG_FACILITY}.err" \
		    -t "NWS.[$ARGV0]" "no directory: $dir"
		return
	fi

	# process scripts in the directories in lexical order
	# note - no leading S or K unlike /etc/rc?.d/
	for f in "$dir"/*
	do
		# Skip empty files; this also covers the literal pattern
		# that is left over when the directory has no entries.
		[ -s "$f" ] || continue

		# SNDR misconfigured - prevent start
		if [ "$arg2" = "sndr_dis" ] && [ "$f" = "$RDC" ]
		then
			continue
		fi

		# run script and pipe output through
		# logger into syslog
		/usr/bin/ksh "$f" "$arg1" "$dg" 2>&1 |
		    logger -p "${SYSLOG_FACILITY}.notice" \
		    -t "NWS.[${ARGV0}:${f##*/}]"
	done
}
383
384
385 #
386 # main
387 #
388
# Exactly two arguments are required: the action and the diskgroup.
if [ $# -ne 2 ]
then
	usage
	# not reached
fi


case "$1" in
start)
	logger -p "${SYSLOG_FACILITY}.notice" -t "NWS.[$ARGV0]" "starting: $ARGV0 $*"
	do_scswitch "$2"
	retval=$?
	# Status 2 from do_scswitch means SNDR is mis-configured.  The
	# comparison is numeric: "==" is not a portable test operator.
	if [ "$retval" -eq 2 ]
	then
		logger -p "${SYSLOG_FACILITY}.err" \
		    -t "NWS.[$ARGV0]" "**FATAL ERROR** Remote Mirror is mis-configured and DISABLED for devicegroup <$2> "
		# Disable SNDR
		process_dir "$NWS_START_DIR" start "$2" sndr_dis
	else
		process_dir "$NWS_START_DIR" start "$2" sndr_ena
	fi
	;;
stop)
	logger -p "${SYSLOG_FACILITY}.notice" -t "NWS.[$ARGV0]" "stopping: $ARGV0 $*"
	process_dir "$NWS_STOP_DIR" stop "$2" sndr_ena
	# The services are stopped on this node, so a resource group
	# switchover still running in the background is killed.
	kill_scswitch "$2"
	;;

*)
	usage
	# not reached
	;;
esac

logger -p "${SYSLOG_FACILITY}.notice" -t "NWS.[$ARGV0]" "completed: $ARGV0 $*"

exit 0