From: Marc G. <gr...@at...> - 2012-11-12 17:49:26
We are getting there. Now I'd like to see the output of this command:

com-chroot ls -l /var/run

Thanks
Marc.

----- Original Message -----
From: "Jorge Silva" <me...@je...>
To: "Marc Grimme" <gr...@at...>
Sent: Monday, November 12, 2012 5:46:58 PM
Subject: Re: Problem with VG activation clvmd runs at 100%

Marc

Hi, sorry, here it is:

total 308
drwxr-xr-x 28 root root 3864 Nov 12 11:00 .
drwxr-xr-x 16 root root 3864 Nov 11 11:51 ..
-rw-r--r-- 1 root root 6 Nov 12 10:58 auditd.pid
drwxr-xr-x 2 avahi avahi 3864 Nov 12 10:58 avahi-daemon
srwxr-xr-x 1 root root 0 Nov 12 10:58 clumond.sock
drwx--x--x 4 root root 3864 Nov 11 18:01 cluster
drwxr-xr-x 2 root root 3864 Nov 2 19:51 console
drwxr-xr-x 2 root root 3864 Nov 12 11:00 ConsoleKit
-rw-r--r-- 1 root root 6 Nov 12 11:00 console-kit-daemon.pid
-rw-r--r-- 1 root root 6 Nov 12 10:58 crond.pid
---------- 1 root root 0 Nov 12 10:58 cron.reboot
drwxr-xr-x 2 root root 3864 Nov 12 10:58 dbus
drwxr-xr-x 2 root root 3864 Apr 16 2012 faillock
drwx------ 2 haldaemon haldaemon 3864 Jul 19 2011 hald
-rw-r--r-- 1 root root 6 Nov 12 10:58 haldaemon.pid
drwxr-xr-x 3 root root 3864 Jul 30 20:34 heartbeat
drwx--x--- 2 root apache 3864 Nov 11 17:04 httpd
-rw-r--r-- 1 root root 6 Nov 12 10:58 ksmtune.pid
drwxr-xr-x 5 root root 3864 Nov 12 10:58 libvirt
-rw-r--r-- 1 root root 5 Nov 12 10:58 libvirtd.pid
-rw------- 1 root root 32 Nov 12 10:58 lldpad.pid
drwx------ 2 root root 3864 Nov 12 10:58 lvm
drwx------ 2 root root 3864 Jun 22 08:32 mdadm
-rw-r--r-- 1 root root 6 Nov 12 10:58 messagebus.pid
-rw-r--r-- 1 root root 5 Nov 12 10:58 modclusterd.pid
-rw-r--r-- 1 root root 5 Nov 12 10:57 multipathd.pid
srwx------ 1 root root 0 Nov 12 10:57 multipathd.sock
drwxr-xr-x 2 mysql mysql 3864 Nov 11 16:11 mysqld
drwxrwxr-x 2 root root 3864 Sep 17 05:55 netreport
drwxr-xr-x 2 root root 3864 Jul 23 02:09 net-snmp
drwxr-xr-x 2 root root 3864 Nov 10 17:55 nscd
drwxr-xr-x 2 nslcd root 3864 Nov 10 11:57 nslcd
-rw-r--r-- 1 root root 5 Nov 12 10:58 ntpd.pid
-rw------- 1 root root 5 Nov 12 10:58 oddjobd.pid
drwxr-xr-x 2 root root 3864 Dec 8 2011 plymouth
drwxr-xr-x 5 root root 3864 Nov 8 14:57 pm-utils
drwxr-xr-x 2 root root 3864 Apr 3 2012 portreserve
drwxr-xr-x 2 root root 3864 Aug 22 2010 ppp
drwxr-xr-x 2 radvd radvd 3864 Nov 11 2010 radvd
-rw-r--r-- 1 root root 5 Nov 12 10:58 ricci.pid
-r--r--r-- 1 root root 0 Nov 12 10:58 rpcbind.lock
-rw-r--r-- 1 root root 6 Nov 12 10:58 rpcbind.pid
srw-rw-rw- 1 root root 0 Nov 12 10:58 rpcbind.sock
drwxr-xr-x 2 root root 3864 Nov 12 10:58 saslauthd
-rw------- 1 root smmsp 34 Nov 12 10:58 sendmail.pid
drwxr-xr-x 2 root root 3864 Apr 16 2012 sepermit
drwxr-xr-x 2 root root 3864 Jun 22 03:51 setrans
-rw-r--r-- 1 smmsp smmsp 50 Nov 12 10:58 sm-client.pid
-rw------- 1 root root 6 Nov 12 10:58 snmpd.pid
-rw-r--r-- 1 root root 6 Nov 12 10:58 sshd.pid
-rw------- 1 root root 6 Nov 12 10:58 sssd.pid
-rw------- 1 root root 6 Nov 12 10:58 syslogd.pid
-rw-rw-r-- 1 root utmp 3456 Nov 12 11:00 utmp
drwxr-xr-x 2 root root 3864 Jun 22 06:05 wpa_supplicant

On Mon, Nov 12, 2012 at 11:40 AM, Marc Grimme <gr...@at...> wrote:

ls -l /var/run is missing, right?
Marc.

----- Original Message -----
From: "Jorge Silva" <me...@je...>
To: "Marc Grimme" <gr...@at...>
Sent: Monday, November 12, 2012 5:38:21 PM
Subject: Re: Problem with VG activation clvmd runs at 100%

Marc

Hi, cluster.conf is attached. I did have it running yesterday, but then things went pear-shaped.

/etc/cdsltab:

bind /.cluster/cdsl/%(nodeid)s/var/run /var/run __initrd
bind /.cluster/cdsl/%(nodeid)s/var/lock /var/lock __initrd
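As an aside, a minimal sketch of what those two cdsltab entries amount to once the %(nodeid)s template is filled in. This assumes nodeid 2 (the node used throughout this thread) and that the __initrd flag means the bind mounts are set up by the comoonics initrd; it is only an illustration, not a command from the mails:

# hypothetical expansion of the cdsltab entries on the node with nodeid=2
mount --bind /.cluster/cdsl/2/var/run /var/run
mount --bind /.cluster/cdsl/2/var/lock /var/lock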
ls -l /var/run

rootfs / rootfs rw 0 0
udev /dev devtmpfs rw,nosuid,relatime,size=12337912k,nr_inodes=3084478,mode=755 0 0
/dev/pts /dev/pts devpts rw,relatime,gid=5,mode=620,ptmxmode=000 0 0
tmpfs /dev/shm tmpfs rw,nosuid,nodev,relatime 0 0
none /var/comoonics/chroot tmpfs rw,relatime 0 0
none /var/comoonics/chroot/dev tmpfs rw,relatime 0 0
none /var/comoonics/chroot/dev/pts devpts rw,relatime,gid=5,mode=620,ptmxmode=000 0 0
proc /var/comoonics/chroot/proc proc rw,relatime 0 0
sysfs /var/comoonics/chroot/sys sysfs rw,relatime 0 0
none /var/comoonics/chroot/sys/kernel/config configfs rw,relatime 0 0
/dev/dm-7 / gfs2 rw,noatime,hostdata=jid=0,localflocks 0 0
/dev/dm-7 /.cdsl.local gfs2 rw,noatime,hostdata=jid=0,localflocks 0 0
/dev/dm-7 /var/run gfs2 rw,noatime,hostdata=jid=0,localflocks 0 0
/dev/dm-7 /var/lock gfs2 rw,noatime,hostdata=jid=0,localflocks 0 0
proc /proc proc rw,relatime 0 0
sysfs /sys sysfs rw,relatime 0 0
/proc/bus/usb /proc/bus/usb usbfs rw,relatime 0 0
/dev/mapper/osbootpp1 /boot ext4 rw,relatime,barrier=1,data=ordered 0 0
none /proc/sys/fs/binfmt_misc binfmt_misc rw,relatime 0 0
cgroup /cgroup/cpuset cgroup rw,relatime,cpuset 0 0
cgroup /cgroup/cpu cgroup rw,relatime,cpu 0 0
cgroup /cgroup/cpuacct cgroup rw,relatime,cpuacct 0 0
cgroup /cgroup/memory cgroup rw,relatime,memory 0 0
cgroup /cgroup/devices cgroup rw,relatime,devices 0 0
cgroup /cgroup/freezer cgroup rw,relatime,freezer 0 0
cgroup /cgroup/net_cls cgroup rw,relatime,net_cls 0 0
cgroup /cgroup/blkio cgroup rw,relatime,blkio 0 0

On Mon, Nov 12, 2012 at 11:21 AM, Marc Grimme <gr...@at...> wrote:

Ok, we need to look deeper into things.
Could you send me /etc/cluster/cluster.conf and - if there - /etc/cdsltab.
Also the output of the following commands:
* cat /proc/mounts
* ls -l /var/run

Marc.

----- Original Message -----
From: "Jorge Silva" <me...@je...>
To: "Marc Grimme" <gr...@at...>
Sent: Monday, November 12, 2012 5:04:42 PM
Subject: Re: Problem with VG activation clvmd runs at 100%

Marc

I have a boot log with the set -x (in text), much easier than JPGs, and attached is lvm.conf.

Thanks
Jorge

On Mon, Nov 12, 2012 at 10:53 AM, Marc Grimme <gr...@at...> wrote:

Ok, one last thing. Would you send me your /etc/lvm/lvm.conf?

Thanks
Marc.
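For background, a generic sketch of the lvm.conf settings that clvmd-based clusters normally rely on. This is an assumed example, not Jorge's actual file (which was sent only as an attachment):

# /etc/lvm/lvm.conf (excerpt, generic sketch)
global {
    # 3 = built-in clustered locking via clvmd; needed to lock/activate clustered VGs
    locking_type = 3
    # if clvmd cannot be reached, LVM may fall back to local file-based locking
    # and will then skip clustered VGs (the behaviour visible in the vgscan output below)
    fallback_to_local_locking = 1
}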
----- Original Message -----
From: "Jorge Silva" <me...@je...>
To: "Marc Grimme" <gr...@at...>
Sent: Monday, November 12, 2012 4:50:19 PM
Subject: Re: Problem with VG activation clvmd runs at 100%

Marc

Hi, please see below:

[root@bwccs302 ~]# rpm -qa comoonics*
comoonics-base-py-5.0-2_rhel6.noarch
comoonics-bootimage-listfiles-rhel6-fencelib-5.0-1_rhel6.noarch
comoonics-cdsl-py-5.0-3_rhel6.noarch
comoonics-imsd-py-5.0-1_rhel6.noarch
comoonics-cluster-py-5.0-2_rhel6.noarch
comoonics-bootimage-listfiles-rhel6-5.0-4_rhel6.noarch
comoonics-bootimage-imsd-5.0-5_rhel6.noarch
comoonics-bootimage-listfiles-firmware-5.0-2_rhel6.noarch
comoonics-release-5.0-2_rhel6.noarch
comoonics-tools-py-5.0-2_rhel6.noarch
comoonics-bootimage-initscripts-5.0-10_rhel6.noarch
comoonics-imsd-plugins-py-5.0-1_rhel6.noarch
comoonics-bootimage-extras-network-5.0-2_rhel6.noarch
comoonics-cluster-tools-py-5.0-3_rhel6.noarch
comoonics-bootimage-5.0-19_rhel6.noarch
comoonics-bootimage-listfiles-rhel6-gfs2-5.0-3_rhel6.noarch
comoonics-bootimage-extras-localconfigs-5.0-9_rhel6.noarch
comoonics-bootimage-listfiles-all-5.0-4_rhel6.noarch
comoonics-bootimage-extras-dm-multipath-rhel6-5.0-2_rhel6.noarch

I only have 1 node up as I don't want to cause even more problems.

[root@bwccs302 ~]# cman_tool services
fence domain
member count 1
victim count 0
victim now 0
master nodeid 2
wait state none
members 2

dlm lockspaces
name OSRoot
id 0xab5404ad
flags 0x00000008 fs_reg
change member 1 joined 1 remove 0 failed 0 seq 1,1
members 2

gfs mountgroups
name OSRoot
id 0x659f7afe
flags 0x00000048 mounted
change member 1 joined 1 remove 0 failed 0 seq 1,1
members 2

clvmd -d - I have verified lvm.conf locking_type=3:

[root@bwccs302 ~]# clvmd -d
CLVMD[f724c7a0]: Nov 12 10:41:44 CLVMD started
CLVMD[f724c7a0]: Nov 12 10:41:44 Connected to CMAN
CLVMD[f724c7a0]: Nov 12 10:41:44 CMAN initialisation complete
CLVMD[f724c7a0]: Nov 12 10:41:44 Opened existing DLM lockspace for CLVMD.
CLVMD[f724c7a0]: Nov 12 10:41:44 DLM initialisation complete
CLVMD[f724c7a0]: Nov 12 10:41:44 Cluster ready, doing some more initialisation
CLVMD[f724c7a0]: Nov 12 10:41:44 starting LVM thread
CLVMD[f724b700]: Nov 12 10:41:44 LVM thread function started
WARNING: Locking disabled. Be careful! This could corrupt your metadata.
CLVMD[f724b700]: Nov 12 10:41:44 getting initial lock for T80WbeSaEpLbxfJKk9yfaeLc4HRSXD4gHE0iUJ3VIj83R3UAVmFkCUuijPMJaxxG
CLVMD[f724b700]: Nov 12 10:41:44 sync_lock: 'T80WbeSaEpLbxfJKk9yfaeLc4HRSXD4gHE0iUJ3VIj83R3UAVmFkCUuijPMJaxxG' mode:1 flags=1
CLVMD[f724b700]: Nov 12 10:41:44 hold_lock. lock at 1 failed: Invalid argument
CLVMD[f724b700]: Nov 12 10:41:44 Failed to hold lock T80WbeSaEpLbxfJKk9yfaeLc4HRSXD4gHE0iUJ3VIj83R3UAVmFkCUuijPMJaxxG
CLVMD[f724b700]: Nov 12 10:41:44 getting initial lock for T80WbeSaEpLbxfJKk9yfaeLc4HRSXD4gaC97Wx4e1LI52FZ0CiZfDmFnTxY64eh2
CLVMD[f724b700]: Nov 12 10:41:44 sync_lock: 'T80WbeSaEpLbxfJKk9yfaeLc4HRSXD4gaC97Wx4e1LI52FZ0CiZfDmFnTxY64eh2' mode:1 flags=1
CLVMD[f724b700]: Nov 12 10:41:44 hold_lock. lock at 1 failed: Invalid argument
CLVMD[f724b700]: Nov 12 10:41:44 Failed to hold lock T80WbeSaEpLbxfJKk9yfaeLc4HRSXD4gaC97Wx4e1LI52FZ0CiZfDmFnTxY64eh2
CLVMD[f724b700]: Nov 12 10:41:44 Sub thread ready for work.
CLVMD[f724b700]: Nov 12 10:41:44 LVM thread waiting for work
CLVMD[f724c7a0]: Nov 12 10:41:44 clvmd ready for work
CLVMD[f724c7a0]: Nov 12 10:41:44 Using timeout of 60 seconds
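As an aside (generic checks, not commands from this mail): the "WARNING: Locking disabled" line next to a verified locking_type=3 is the kind of mismatch one would normally chase by comparing what the host and the comoonics chroot actually see, roughly along these lines. com-chroot is assumed to pass a command through to the chroot the same way Marc uses it above, and whether the chroot carries its own lvm.conf is also an assumption:

# which locking type is configured on the host
grep -E '^[[:space:]]*locking_type' /etc/lvm/lvm.conf
# same check inside the comoonics chroot, assuming it has its own copy of lvm.conf
com-chroot grep -E '^[[:space:]]*locking_type' /etc/lvm/lvm.conf
# is a clvmd already running, and under which parent - as Marc asked for earlier
ps axfwww | grep [c]lvm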
On Mon, Nov 12, 2012 at 10:33 AM, Marc Grimme <gr...@at...> wrote:

Jorge,
I would like to have a look at the following command outputs:

rpm -qa comoonics*

For all running nodes:

cman_tool services
ps axfwww | grep clvm

Then try to start the clvmd manually and make it a little chatty:

clvmd -d

And send me the output.

Thanks
Marc.

----- Original Message -----
From: "Jorge Silva" <me...@je...>
To: "Marc Grimme" <gr...@at...>
Sent: Monday, November 12, 2012 4:19:37 PM
Subject: Re: Problem with VG activation clvmd runs at 100%

Marc

Hi, I have serial console logging (apologies for not getting this done) and I can see all the cluster services starting as expected and properly. When I log on:

[root@bwccs302 ~]# clustat
Cluster Status for ProdCluster01 @ Mon Nov 12 10:17:26 2012
Member Status: Quorate

Member Name                 ID   Status
------ ----                 ---- ------
smc01a                       1   Offline
smc01b                       2   Online, Local
smc01c                       3   Offline
smc01d                       4   Offline
/dev/block/253:4             0   Online, Quorum Disk

[root@bwccs302 ~]# vgdisplay
cluster request failed: Invalid argument
Can't get lock for VG_OSROOT
cluster request failed: Invalid argument
Can't get lock for vg_osroot

[root@bwccs302 ~]# ls -al /dev/VG_OSROOT/LV_*
lrwxrwxrwx 1 root root 7 Nov 12 10:14 /dev/VG_OSROOT/LV_ROOT -> ../dm-8
lrwxrwxrwx 1 root root 7 Nov 12 10:14 /dev/VG_OSROOT/LV_SWAP -> ../dm-9

On Mon, Nov 12, 2012 at 9:57 AM, Jorge Silva <me...@je...> wrote:

Marc

Hi, the problem is this: I believe I had set up an open-sharedroot cluster where more than one node boots off a GFS2 filesystem. I believe I had managed to get the system to that state when I sent you the first screenshot, where the cluster would get started and the clustered volumes would get detected in the comoonics boot stage. My problem was that once the boot process switched to the GFS filesystem, none of the clustered volumes were visible via any vg commands, and any vg* commands failed. If I looked in /dev/VG, I could see all the clustered volumes; however, clvmd would be running at 100% and any attempt to start it would fail. All cluster resources and commands would work, though: cman_tool nodes would list nodes, etc. I have just rebuilt the ramdisk and I notice that none of the cluster services have started, which is even more odd.

Thanks
Jorge

On Mon, Nov 12, 2012 at 9:12 AM, Marc Grimme <gr...@at...> wrote:

This looks perfectly ok. The "failed" after activation of the VGs is because there are clustered VGs present (which again is perfectly ok). Then the bootup continues as expected, as can be seen in the logs.
I think I don't understand the problem you are talking about. Perhaps you could try to explain your problem in more detail.

Thanks
Marc.

----- Original Message -----
From: "Jorge Silva" <me...@je...>
To: "Marc Grimme" <gr...@at...>
Sent: Monday, November 12, 2012 3:04:03 PM
Subject: Re: Problem with VG activation clvmd runs at 100%

Marc

Hi, thanks for your help. I got rid of some of the clustered volumes for clarity; apologies for the unorthodox screen log, I must get console logging done via serial... I booted into emergency mode, and:

ls -l /etc/rc3.d/S*
lrwxrwxrwx 1 root root 11 Nov 7 17:21 /etc/rc3.d/S99local -> ../rc.local.comoonics

Edited rc.sysinit (the line on mine is line 205) and continued. Attached is the output from set -x.

Thanks
Jorge
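For readers following along, a generic illustration of what wrapping the LVM activation part of rc.sysinit in shell tracing looks like. This is an assumed sketch, not Jorge's actual edit, and the exact line number differs between releases:

# /etc/rc.d/rc.sysinit, around the LVM activation block (exact line differs per release)
set -x     # from here on, print every command with its expanded arguments to the console
# ... existing vgchange / LVM activation commands ...
set +x     # stop tracing again so the rest of the boot log stays readable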
On Mon, Nov 12, 2012 at 6:01 AM, Marc Grimme <gr...@at...> wrote:

Hi Jorge,
try to boot the cluster into emergency mode by adding a "1" to the boot prompt. With this you should end up in a console. Then issue the following commands and send me the output:

ls -l /etc/rc3.d/S*

Also add the following line before lvm is started (rc.sysinit line 199):

set -x

Then we should see more at the next bootup.

Thanks
Marc.

----- Original Message -----
From: "Jorge Silva" <me...@je...>
To: "Marc Grimme" <gr...@at...>
Sent: Monday, November 12, 2012 4:43:22 AM
Subject: Problem with VG activation clvmd runs at 100%

Marc

Hi, apologies for not getting back to you; it has been some time since we communicated. I am an equity derivatives trader, and at the time I was helping a friend set up an equity trading platform as a proof of concept. It was pretty low priority and was more of a learning tool for me, so I didn't spend too much time on it. I was forced to upgrade recently as this has moved from proof of concept to the next step.

I apologise for bothering you, but I have spent the last few days trying to get an OSR cluster running on CentOS 6.3 + GFS2 and I believe I am almost there, but I am stuck and unsure what is going on. The cluster seems to be working ok, but clvmd is running at 100%, and if I restart it I still get the same result. Attached is a screenshot of the final phase of booting showing the error. The cluster is quorate and shutdown also works OK.

Thanks
Jorge

An output of vgscan:

vgscan
connect() failed on local socket: No such file or directory
Internal cluster locking initialisation failed.
WARNING: Falling back to local file-based locking.
Volume Groups with the clustered attribute will be inaccessible.
Reading all physical volumes. This may take a while...
Skipping clustered volume group VG_OSROOT
Found volume group "VG_DBDISKS" using metadata type lvm2
Skipping clustered volume group VG_SDATA
Found volume group "vg_osroot" using metadata type lvm2

These are the lvm2 packages:

lvm2-2.02.95-10.el6_3.2.x86_64
lvm2-cluster-2.02.95-10.el6_3.2.x86_64

I think these are what is causing the problem, but I'm not sure...
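A small aside (standard LVM usage, not a command from the thread): which VGs carry the clustered flag that makes vgscan skip them when clvmd is unreachable can be read off the VG attribute string, for example:

# list VGs with their attribute string; a 'c' in the 6th attribute position marks a
# clustered VG, which needs a working clvmd (locking_type = 3) to be locked and activated
vgs -o vg_name,vg_attr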
lrwxrwxrwx 1 root root 41 Nov 11 21:54 /var/run/cman_admin -> /var/comoonics/chroot//var/run/cman_admin
lrwxrwxrwx 1 root root 42 Nov 11 21:54 /var/run/cman_client -> /var/comoonics/chroot//var/run/cman_client

I have tried re-ordering /etc/cdsltab; it currently is:

bind /.cluster/cdsl/%(nodeid)s/var/run /var/run __initrd
bind /.cluster/cdsl/%(nodeid)s/var/lock /var/lock __initrd

I have tried:

rm -fr /var/cache/comoonics-bootimage/* ; rm -fr /var/cache/comoonics-repository/* ; mkinitrd -V /boot/initrd_sr-$(uname -r).img $(uname -r)

My cluster.conf for the nodes looks like:

<clusternode name="smc01b" nodeid="2" votes="1">
  <multicast addr="229.192.0.2" interface="bond0.1762"/>
  <fence>
    <method name="single">
      <device ipaddr="172.17.50.16" name="ipmi"/>
    </method>
  </fence>
  <com_info>
    <eth mac="00:30:48:F0:10:54" master="bond0" name="eth0" slave="yes"/>
    <eth mac="00:30:48:F0:10:55" master="bond0" name="eth1" slave="yes"/>
    <eth name="bond0">
      <properties>
        <property name="BONDING_OPTS">BONDING_OPTS="miimon=100 mode=4 xmit_hash_policy=2 "</property>
      </properties>
    </eth>
    <eth name="bond0.1762" ip="172.17.60.12" mask="255.255.255.0" gateway="">
      <properties>
        <property name="VLAN">VLAN="yes"</property>
      </properties>
    </eth>
  </com_info>
</clusternode>

I have tried re-installing the packages below:

comoonics-base-py-5.0-2_rhel6.noarch
comoonics-bootimage-listfiles-rhel6-fencelib-5.0-1_rhel6.noarch
comoonics-cdsl-py-5.0-3_rhel6.noarch
comoonics-imsd-py-5.0-1_rhel6.noarch
comoonics-cluster-py-5.0-2_rhel6.noarch
comoonics-bootimage-listfiles-rhel6-5.0-4_rhel6.noarch
comoonics-bootimage-imsd-5.0-5_rhel6.noarch
comoonics-bootimage-listfiles-firmware-5.0-2_rhel6.noarch
comoonics-release-5.0-2_rhel6.noarch
comoonics-tools-py-5.0-2_rhel6.noarch
comoonics-bootimage-initscripts-5.0-10_rhel6.noarch
comoonics-imsd-plugins-py-5.0-1_rhel6.noarch
comoonics-bootimage-extras-network-5.0-2_rhel6.noarch
comoonics-cluster-tools-py-5.0-3_rhel6.noarch
comoonics-bootimage-5.0-19_rhel6.noarch
comoonics-bootimage-listfiles-rhel6-gfs2-5.0-3_rhel6.noarch
comoonics-bootimage-extras-localconfigs-5.0-9_rhel6.noarch
comoonics-bootimage-listfiles-all-5.0-4_rhel6.noarch
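A closing aside (generic commands, not part of the original mail): the two /var/run/cman_* symlinks above only help if their targets actually exist inside the comoonics chroot, which can be sanity-checked along these lines. Whether these sockets are related to the locking failures seen earlier in the thread is not established here; this is only a sketch of the check:

# do the symlink targets exist inside the chroot?
ls -l /var/comoonics/chroot/var/run/cman_admin /var/comoonics/chroot/var/run/cman_client
# resolve the symlinks from the host side; readlink -e prints nothing for a broken link
readlink -e /var/run/cman_admin /var/run/cman_client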