#50 Root Node seemed to hang and fall out of cluster

closed-fixed
nobody
Networking (12)
5
2005-08-26
2004-04-22
No

The root node hung and was pulled out of the cluster
(node 2 did take over as CLMS master and the rest of
the 14 nodes were ok). kswapd seemed to be waiting on
a lock.

Process 12 kswapd
kernel .text 0xc0100000
0xc0129100
0xc0129260
0xc277fc78 0xc012718b schedule+0x30b (0x1, 0xc277e000,
0xdb4dc5dc,
0xdb4dc5dc, 0x0)
kernel .text 0xc0100000
0xc0126e80
0xc0127390
0xc277fcac 0xc010b1a3 __down+0x73 (0xdb4dc5cc,
0xdf2a1b00, 0xdb4dc580)
kernel .text 0xc0100000
0xc010b130
0xc010b200
0xc277fcc0 0xc010b37f __down_failed+0xb (0x246, 0x4,
0xc277fe58,
0xc277fd7c, 0x0
)
kernel .text 0xc0100000
0xc010b374
0xc010b388
0xc0295443 .text.lock.ics_llcli+0x19
kernel .text 0xc0100000
0xc029542a
0xc0295570
0xc277fcf0 0xc0294a86 icscli_llsend+0x196 (0xdf6e6400,
0xcd0d7780,
0x98, 0x1, 0xa)
kernel .text 0xc0100000
0xc02948f0
0xc0294d50
0xc277fd18 0xc028e981 icscli_send+0x141 (0xdf6e6400,
0xe, 0xc028e9b0,
0x0, 0xc277e000)
kernel .text 0xc0100000
0xc028e840
0xc028e9b0
0xc277fd3c 0xc0299d64 cli_icsnsc_rcall+0x154 (0x3,
0xc277fd70, 0xa0001,
0xe, 0x0
)
kernel .text 0xc0100000
0xc0299c10
0xc0299dc0
0xc277fda8 0xc029a26a nsc_rcall+0xea (0x10a0003, 0xe,
0xc0261790,
0xc277fe14, 0x
c 260490)
[0]more> kernel .text 0xc0100000
0xc029a180
0xc029a310
0xc277fec4 0xc024f317 msgsend+0x157 (0xd70eb880, 0x298,
0x1001, 0x2,
0x0) kernel .text
0xc0100000
0xc024f1c0 0xc024f370
0xc277fefc 0xc024f497 process_msgs+0x127 (0x1,
0xd39c6d04, 0x1, 0x1,
0xd7225800)
kernel .text 0xc0100000
0xc024f370
0xc024f4eb
0xc277ff30 0xc026572f cfstok_giveback+0x6f (0xd7225800,
0xd7225800,
0xc277ff80)
kernel .text 0xc0100000
0xc02656c0
0xc0265770
0xc277ff44 0xc0254f1b cfs_clear_inode+0x1b (0xd7225800,
0xd7225800)
kernel .text 0xc0100000
0xc0254f00
0xc0254f60
0xc277ff54 0xc01855b3 clear_inode+0x103 (0xd7225800,
0xc277ff78,
0xc0182a90, 0xc058f324, 0x47b)
kernel .text 0xc0100000
0xc01854b0
0xc01855f0
0xc277ff70 0xc018563c dispose_list+0x4c (0xc277ff80,
0x15, 0xd7225308,
0xcfb39088, 0xc058a900)
kernel .text 0xc0100000
0xc01855f0
0xc0185680
0xc277ff94 0xc0185a0b prune_icache+0x8b (0x47b, 0x40)
kernel .text 0xc0100000
0xc0185980
0xc0185bc0
0xc277ffa4 0xc0185be5 shrink_icache_memory+0x25 (0x6,
0x1d0, 0x14, 0x0,
0x0)
kernel .text 0xc0100000
0xc0185bc0
0xc0185c30
0xc277ffc8 0xc015ad22 do_try_to_free_pages_kswapd+0x142
(0x1d0, 0x2,
0x1d0, 0x0,
0xc015aed0)
kernel .text 0xc0100000
0xc015abe0
0xc015add0
0xc277ffec 0xc015af8b kswapd+0xbb
kernel .text 0xc0100000
0xc015aed0
0xc015b100
0xc0109989 kernel_thread_helper+0x5

Stack traceback for pid 65642
0xdefd8000 65642 2 0 3 D 0xdefd8570
cm nodedwn dmn
EBP EIP Function (args)
0xdefd9eb0 0xc01291a4 context_switch+0xa4 (0xc0628c80,
0xdefd8000,
0xd4414000, 0
xdefd9ef4, 0xc02105c0)
kernel .text 0xc0100000
0xc0129100
0xc0129260
0xdefd9ef4 0xc012718b schedule+0x30b (0x0, 0xdefd8000,
0x0, 0x0, 0x0)
kernel .text 0xc0100000
0xc0126e80
0xc0127390
0xdefd9f48 0xc0321b67 __lock_sock+0x87 (0xddaccd00,
0xdefd9f6c, 0x0,
0xdb4dc580,
0xdf2a1b80)
kernel .text 0xc0100000
0xc0321ae0
0xc0321bb0
0xdefd9f68 0xc0371cb8 inet_shutdown+0x1e8 (0xdefede40,
0x2, 0x1, 0x1,
0xdf2a1b8c
)
kernel .text 0xc0100000
0xc0371ad0
0xc0371cc0
0xdefd9f90 0xc0291b66 ics_llnodedown_async_start+0x126
(0x3, 0x3)
kernel .text 0xc0100000
0xc0291a40
0xc0291c30
0xdefd9fa0 0xc0291a24 ics_llnodedown+0x14 (0x3, 0x20,
0xc675e4c0)
kernel .text 0xc0100000
0xc0291a10
0xc0291a40
0xdefd9fb4 0xc028e017 ics_nodedown+0x57 (0x3,
0xc675e4c0, 0x0, 0x0, 0x0)
kernel .text 0xc0100000
0xc028dfc0
0xc028e080
0xdefd9fec 0xc0283fff clms_master_nodedown_daemon+0x13f
kernel .text 0xc0100000
0xc0283ec0
0xc0284050
0xc0109989 kernel_thread_helper+0x5
kernel .text 0xc0100000
0xc0109984
0xc0109990

0xdefd4000 65644 2 0 2 D 0xdefd4570
nm mas send dae
EBP EIP Function (args)
0xdefd5e18 0xc01291a4 context_switch+0xa4 (0xc0628280,
0xdefd4000,
0xd48f4000, 0
x286, 0xdefd5ef8)
kernel .text 0xc0100000
0xc0129100
0xc0129260
0xdefd5e5c 0xc012718b schedule+0x30b (0x0, 0x1, 0x0,
0xdefd4000, 0x0)
kernel .text 0xc0100000
0xc0126e80
0xc0127390
0xdefd5eb4 0xc015b1e2 wakeup_kswapd+0xe2 (0x1f0, 0x0,
0x2, 0x1, 0x0)
kernel .text 0xc0100000
0xc015b100
0xc015b1f0
0xdefd5efc 0xc015d086 __alloc_pages+0x106
kernel .text 0xc0100000
0xc015cf80
0xc015d320
0xdefd5f04 0xc015ce36 _alloc_pages+0x16
kernel .text 0xc0100000
0xc015ce20
0xc015ce40
0xdefd5f0c 0xc015d347 __get_free_pages+0x27
(0xc0619200, 0xd48f4570,
0xd48f4000,
0xd48f4000, 0x1)
kernel .text 0xc0100000
0xc015d320
0xc015d350
0xdefd5f38 0xc01561a5 kmem_cache_grow+0xc5 (0xc2738bf0,
0x1f0, 0x1f0,
0xc012718b
, 0x0)
kernel .text 0xc0100000
0xc01560e0
0xc0156370
0xdefd5f5c 0xc015709f __kmem_cache_alloc+0x6f
(0xc2738bf0, 0x1f0, 0x2)
kernel .text 0xc0100000
0xc0157030
0xc0157170
0xdefd5f70 0xc0156519 kmalloc+0x49 (0x604, 0x1f0,
0xdefd5fc8,
0xc85fbc00, 0x4a62
f80)
kernel .text 0xc0100000
0xc01564d0
0xc0156520
0xdefd5fb0 0xc0292090 ics_llunack_send+0x90 (0x2, 0x3,
0xc06c5fc0,
0x5e8)
kernel .text 0xc0100000
0xc0292000
0xc02921d0
0xdefd5fc8 0xc028e366 ics_unack_send+0x26 (0x2, 0x3,
0xc06c5fc0, 0x5e8,
0x3)
kernel .text 0xc0100000
0xc028e340
0xc028e370
0xdefd5fec 0xc02a032f nm_master_send_daemon+0xef
kernel .text 0xc0100000
0xc02a0240
0xc02a0380
0xc0109989 kernel_thread_helper+0x5
kernel .text 0xc0100000
0xc0109984
0xc0109990
0xdee24000 1 0 0 0 D 0xdee24570 init
EBP EIP Function (args)
0xdee25d40 0xc01291a4 context_switch+0xa4 (0xc0628280,
0xdee24000,
0xd48f4000, 0
x286, 0x0)
kernel .text 0xc0100000
0xc0129100
0xc0129260
0xdee25d84 0xc012718b schedule+0x30b (0x0, 0x1, 0x0,
0xdee24000, 0x0)
kernel .text 0xc0100000
0xc0126e80
0xc0127390
0xdee25ddc 0xc015b1e2 wakeup_kswapd+0xe2 (0x1f0, 0x0,
0x2, 0x1, 0x0)
kernel .text 0xc0100000
0xc015b100
0xc015b1f0
0xdee25e24 0xc015d086 __alloc_pages+0x106
kernel .text 0xc0100000
0xc015cf80
0xc015d320
0xdee25e2c 0xc015ce36 _alloc_pages+0x16
kernel .text 0xc0100000
0xc015ce20
0xc015ce40
0xdee25e34 0xc015d347 __get_free_pages+0x27
(0xded53a80, 0xc269ee00,
0x145)
kernel .text 0xc0100000
0xc015d320
0xc015d350
0xdee25e48 0xc017e0cd __pollwait+0x2d (0xc269ee00,
0xdefc8880,
0xdee25f28, 0xc26
9ee00, 0xd9b4e000)
kernel .text 0xc0100000
0xc017e0a0
0xc017e170
0xdee25e64 0xc01763f6 pipe_poll+0x36 (0xc269ee00,
0xdee25f28,
0xc269ee00, 0x0, 0
x0)
kernel .text 0xc0100000
0xc01763c0
0xc0176440
0xdee25e90 0xc0236fa7 ssidev_do_pollfd+0x107 (0x0, 0x1,
0xd9b4e000,
0xdee25f28,
0xcf737308)
kernel .text 0xc0100000
0xc0236ea0
0xc0236fb0
[0xdee25e48 0xc017e0cd __pollwait+0x2d (0xc269ee00,
0xdefc8880,
0xdee25f28, 0xc26
9ee00, 0xd9b4e000)
kernel .text 0xc0100000
0xc017e0a0
0xc017e170
0xdee25e64 0xc01763f6 pipe_poll+0x36 (0xc269ee00,
0xdee25f28,
0xc269ee00, 0x0, 0
x0)
kernel .text 0xc0100000
0xc01763c0
0xc0176440
0xdee25e90 0xc0236fa7 ssidev_do_pollfd+0x107 (0x0, 0x1,
0xd9b4e000,
0xdee25f28,
0xcf737308)
kernel .text 0xc0100000
0xc0236ea0
0xc0236fb0
[0]more>

0xdee25ed0 0xc023725c ssidev_do_poll_node+0x9c
(0xcf737308, 0x0,
0xdee25f28, 0xd
ee25f90, 0xc4db9e00)
kernel .text 0xc0100000
0xc02371c0
0xc02372c0
0xdee25f10 0xc0237480 ssidev_do_poll_nodes+0x1c0
(0xdf6de800,
0xdee25f28, 0xdee2
5f84, 0xdf6de800, 0x0)
kernel .text 0xc0100000
0xc02372c0
0xc0237600
0xdee25f44 0xc017e50f do_select+0xdf (0xb, 0xdee25f90,
0xdee25f84,
0x2a, 0x1)
kernel .text 0xc0100000
0xc017e430
0xc017e590
0xdee25fbc 0xc017e9b4 sys_select+0x3d4
kernel .text 0xc0100000
0xc017e5e0
0xc017ed40
0xc05de06d no_timing+0x7
kernel .entry.text
0xc05de000 0xc05de066
0xc05de074

Discussion

  • David Zafman

    David Zafman - 2004-05-10
    • labels: 375979 --> Networking
    • assigned_to: dzafman --> nobody
     
  • David Zafman

    David Zafman - 2004-05-10

    Logged In: YES
    user_id=297844

    This looks like it could be an icscli_send() deadlocking
    with inet_shutdown() because of a nodedown. Maybe
    inet_shutdown() needs to wake processes up that might be
    trying to send to the down node, so it can get the lock and
    allow them to get an error.

     
  • Scott Hinchley

    Scott Hinchley - 2004-05-10

    Logged In: YES
    user_id=790605

    Since adding John's CFS leak patch
    (http://marc.theaimsgroup.com/?l=ssic-linux-users&m=108335750309680&w=2)
    this cluster has run for over a week; prior to that it failed
    within 24 hours.

     
  • Brian J. Watson

    Brian J. Watson - 2004-05-27
    • priority: 7 --> 5
     
  • Roger Tsang

    Roger Tsang - 2005-08-26
    • status: open --> closed-fixed
     

Get latest updates about Open Source Projects, Conferences and News.

Sign up for the SourceForge newsletter:





No, thanks