Hi,
This is a CentOS 6.4 cluster with two physical machines, which I plan to use for GFS2 over iSCSI. The problem is that stopping rgmanager hangs indefinitely. Searching the internet shows others hitting the same problem, but no fix. Does anyone know of a fix, or have some ideas? Thank you.
============
[root@localhost2 cluster]# clustat
Cluster Status for GFS2Cluster @ Tue Jun 11 11:16:54 2013
Member Status: Quorate
Member Name                  ID   Status
------ ----                  ---- ------
localhost1                      1 Online
localhost2                      2 Online, Local
=============
[root@localhost2 cluster]# /etc/init.d/rgmanager stop
Stopping Cluster Service Manager:
rgmanager stop hangs here; I have to Ctrl-C to get out.
The same thing happens on the other node, localhost1.
iptables has been disabled on both nodes.
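My guess is that the stop script is blocked on the cluster infrastructure rather than on rgmanager itself, since fencing appears to be wedged (see the fence log below). Some diagnostics I can run to confirm, all standard cman/fenced tools on EL6 as far as I know:

# show fence domain state; a stuck fence operation should show up here
fence_tool ls
# state of the fence/dlm/gfs groups
group_tool ls
# overall cluster/quorum status
cman_tool status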
=============
shutdown -ry now
This hangs as well, because the K01rgmanager stop script gets stuck:
[root@localhost2 cluster]# ps -ef |grep man
root 5884 5842 0 09:35 ? 00:00:08 /bin/bash /etc/rc6.d/K01rgmanager stop
root 7475 1 0 Jun10 ? 00:00:00 rgmanager
root 7477 7475 0 Jun10 ? 00:00:00 rgmanager
root 27279 6481 0 11:15 pts/1 00:00:00 grep man
[root@localhost2 cluster]#
Now, if I issue "kill 5884" to stop the hanging process, the whole machine hangs and stops responding to commands, and I have to power it off with the physical power button and power it back on.
How do you shut down the machine gracefully in this situation? <================
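The only workaround I can think of (untested, so please tell me if this is dangerous) is to kill the rgmanager daemons directly and then tear down the rest of the stack by hand:

# pids taken from the ps output above
kill -9 7475 7477
# then stop the rest of the cluster stack manually
/etc/init.d/cman stop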
==============
[root@localhost2 log]# grep -i fence messages
Jun 11 08:37:12 localhost2 fenced[1591]: fencing node localhost1 still retrying
Jun 11 09:07:53 localhost2 fenced[1591]: fencing node localhost1 still retrying
Jun 11 09:38:33 localhost2 fenced[1591]: fencing node localhost1 still retrying
Jun 11 10:09:14 localhost2 fenced[1591]: fencing node localhost1 still retrying
Jun 11 10:39:55 localhost2 fenced[1591]: fencing node localhost1 still retrying
Jun 11 10:58:58 localhost2 fence_node[23857]: fence localhost2 failed
Jun 11 10:59:34 localhost2 fence_node[23993]: status localhost2 failed -1
Jun 11 11:10:35 localhost2 fenced[1591]: fencing node localhost1 still retrying
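If I read these messages right, fenced has a pending fence of localhost1 that never completes, and my understanding is that manual fencing requires an operator to acknowledge the fence by hand. Assuming fence_ack_manual still ships on EL6, would something like this clear the retry loop?

# acknowledge the pending manual fence of localhost1
fence_ack_manual localhost1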
==============
[root@localhost2 cluster]# rpm -qa |grep -i cluster
clusterlib-3.0.12.1-49.el6.x86_64
cluster-glue-libs-1.0.5-6.el6.x86_64
modcluster-0.16.2-20.el6.x86_64
lvm2-cluster-2.02.98-9.el6.x86_64
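(Grepping for "cluster" misses the core packages; for completeness I can also check them directly:)

rpm -q cman rgmanager gfs2-utils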
==============
[root@localhost2 cluster]# cat cluster.conf
(this file was set up following suggestions found on the internet)
<?xml version="1.0"?>
<cluster name="GFS2Cluster" config_version="3">
  <cman two_node="1" expected_votes="1"/>
  <clusternodes>
    <clusternode name="localhost1" votes="1" nodeid="1">
      <fence>
        <method name="single">
        </method>
      </fence>
    </clusternode>
    <clusternode name="localhost2" votes="1" nodeid="2">
      <fence>
        <method name="single">
        </method>
      </fence>
    </clusternode>
  </clusternodes>
  <fencedevices>
    <fencedevice name="localhost1_ipmi" agent="fence_manual"/>
    <fencedevice name="localhost2_ipmi" agent="fence_manual"/>
  </fencedevices>
  <rm>
    <failoverdomains/>
    <resources/>
  </rm>
</cluster>
[root@localhost2 cluster]#
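Looking at this again, I suspect the fencing configuration is the real problem: the <method name="single"> blocks are empty, so they never reference the <fencedevice> entries defined below them, which would mean fencing can never succeed. From the cluster.conf examples I have seen, each method should contain a <device> pointing at a defined fence device, roughly like this (device names copied from my own config; whether the fence_manual agent is even supported on EL6 is a separate question):

<clusternode name="localhost1" votes="1" nodeid="1">
  <fence>
    <method name="single">
      <device name="localhost1_ipmi"/>
    </method>
  </fence>
</clusternode>

Can anyone confirm that the empty methods would explain the endless "still retrying" loop in the fence log, and therefore the rgmanager hang?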
==================
Running rgmanager stop with strace:
[root@localhost2 cluster]# strace /etc/init.d/rgmanager stop
... long stretches of trace output, ending with:
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
dup2(11, 2) = 2
fcntl(11, F_GETFD) = 0x1 (flags FD_CLOEXEC)
close(11) = 0
dup2(10, 1) = 1
fcntl(10, F_GETFD) = 0x1 (flags FD_CLOEXEC)
close(10) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [INT CHLD], [], 8) = 0
clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0x7f0cde0039d0) = 28535
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0
rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0
rt_sigaction(SIGINT, {0x43d060, [], SA_RESTORER, 0x7f0cdd669920}, {SIG_DFL, [], SA_RESTORER, 0x7f0cdd669920}, 8) = 0
wait4(-1,
Stuck here, blocked in wait4() waiting for a child process to exit.
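So the init script is sitting in wait4() on the child it cloned (pid 28535 above). Unless someone has a better idea, my next step would be to look at what that child is doing:

ps -fp 28535                # what the child actually is
cat /proc/28535/wchan       # kernel function it is sleeping in
strace -p 28535             # attach and watch its syscalls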
===============