From: Chan, S. (S. E. - SGI) <sam...@hp...> - 2024-03-12 01:11:30
|
maybe a bit more data from the crash dump might help. initially, i suggested that the problem might be due to scst_unregister_session() being called in parallel to scst_rx_mgmt_fn(). but now, i don't think that's the case. here's a better output of the stack trace... crash.bin> bt -t PID: 30180 TASK: ffff972e85e1b180 CPU: 1 COMMAND: "kworker/1:0" START: machine_kexec at ffffffffb2e66294 [ffff972d3e40b920] machine_kexec at ffffffffb2e66294 [ffff972d3e40b980] __crash_kexec at ffffffffb2f22562 [ffff972d3e40ba08] scst_rx_mgmt_fn at ffffffffc0ea96c1 [scst] [ffff972d3e40ba50] crash_kexec at ffffffffb2f22650 [ffff972d3e40ba68] oops_end at ffffffffb358b798 [ffff972d3e40ba90] die at ffffffffb2e30a7b [ffff972d3e40bac0] do_trap at ffffffffb358aee0 [ffff972d3e40bb10] do_invalid_op at ffffffffb2e2d2a4 [ffff972d3e40bb28] scst_rx_mgmt_fn at ffffffffc0ea96c1 [scst] [ffff972d3e40bbc0] invalid_op at ffffffffb35972ee [ffff972d3e40bc48] scst_rx_mgmt_fn at ffffffffc0ea96c1 [scst] [ffff972d3e40bc70] scst_rx_mgmt_fn at ffffffffc0ea96c1 [scst] [ffff972d3e40bcb8] vprintk_default at ffffffffb2e9e489 [ffff972d3e40bcf8] scst_unregister_session at ffffffffc0ea9771 [scst] [ffff972d3e40bd88] srpt_unregister_ch at ffffffffc0432389 [ib_srpt] [ffff972d3e40bda0] srpt_do_compl_work at ffffffffc0436228 [ib_srpt] [ffff972d3e40be20] process_one_work at ffffffffb2ebdc4f [ffff972d3e40be68] worker_thread at ffffffffb2ebed66 [ffff972d3e40bea8] worker_thread at ffffffffb2ebec40 [ffff972d3e40bec8] kthread at ffffffffb2ec5c21 [ffff972d3e40bf30] kthread at ffffffffb2ec5b50 [ffff972d3e40bf50] ret_from_fork_nospec_end at ffffffffb3593df7 [ffff972d3e40bf80] kthread at ffffffffb2ec5b50 so scst_unregister_session() seemed to have completed before scst_rx_mgmt_fn(). both of these functions are accessing scst_session. i found that to be this, if this helps... struct scst_session { init_phase = 0x3, tgt = 0xffff972fb56f0000, sess_tgt_priv = 0xffff972e5cebb000, sess_aflags = 0x0, tgt_dev_list_mutex = { count = { counter = 0x1 }, wait_lock = { { rlock = { raw_lock = { val = { counter = 0x0 } } } } }, wait_list = { next = 0xffff972c087a4d28, prev = 0xffff972c087a4d28 }, owner = 0x0, { osq = { tail = { counter = 0x0 } }, __UNIQUE_ID_rh_kabi_hide3 = { spin_mlock = 0x0 }, {<No data fields>} } }, sess_tgt_dev_list = {{ next = 0xffff972e776be540, prev = 0xffff972e776be540 }, { next = 0xffff972c087a4d58, prev = 0xffff972c087a4d58 }, { next = 0xffff972c087a4d68, prev = 0xffff972c087a4d68 }, { next = 0xffff972c087a4d78, prev = 0xffff972c087a4d78 }, { next = 0xffff972c087a4d88, prev = 0xffff972c087a4d88 }, { next = 0xffff972c087a4d98, prev = 0xffff972c087a4d98 }, { next = 0xffff972c087a4da8, prev = 0xffff972c087a4da8 }, { next = 0xffff972c087a4db8, prev = 0xffff972c087a4db8 }, { next = 0xffff972c087a4dc8, prev = 0xffff972c087a4dc8 }, { next = 0xffff972c087a4dd8, prev = 0xffff972c087a4dd8 }, { next = 0xffff972c087a4de8, prev = 0xffff972c087a4de8 }, { next = 0xffff972c087a4df8, prev = 0xffff972c087a4df8 }, { next = 0xffff972c087a4e08, prev = 0xffff972c087a4e08 }, { next = 0xffff972c087a4e18, prev = 0xffff972c087a4e18 }, { next = 0xffff972c087a4e28, prev = 0xffff972c087a4e28 }, { next = 0xffff972c087a4e38, prev = 0xffff972c087a4e38 }, { next = 0xffff972c087a4e48, prev = 0xffff972c087a4e48 }, { next = 0xffff972c087a4e58, prev = 0xffff972c087a4e58 }, { next = 0xffff972c087a4e68, prev = 0xffff972c087a4e68 }, { next = 0xffff972c087a4e78, prev = 0xffff972c087a4e78 }, { next = 0xffff972c087a4e88, prev = 0xffff972c087a4e88 }, { next = 0xffff972c087a4e98, prev = 0xffff972c087a4e98 }, { next = 0xffff972c087a4ea8, prev = 0xffff972c087a4ea8 }, { next = 0xffff972c087a4eb8, prev = 0xffff972c087a4eb8 }, { next = 0xffff972c087a4ec8, prev = 0xffff972c087a4ec8 }, { next = 0xffff972c087a4ed8, prev = 0xffff972c087a4ed8 }, { next = 0xffff972c087a4ee8, prev = 0xffff972c087a4ee8 }, { next = 0xffff972c087a4ef8, prev = 0xffff972c087a4ef8 }, { next = 0xffff972c087a4f08, prev = 0xffff972c087a4f08 }, { next = 0xffff972c087a4f18, prev = 0xffff972c087a4f18 }, { next = 0xffff972c087a4f28, prev = 0xffff972c087a4f28 }, { next = 0xffff972c087a4f38, prev = 0xffff972c087a4f38 }}, sess_cmd_list = { next = 0xffff972c087a4f80, prev = 0xffff972c087a4f80 }, sess_list_lock = { { rlock = { raw_lock = { val = { counter = 0x0 } } } } }, refcnt = { count = { counter = 0x8000000000000001 }, percpu_count_ptr = 0x2d1dc021cc03, release = 0xffffffffc0e85bb0 <scst_sess_release>, confirm_switch = 0xffffffffb318ac10, force_atomic = 0x0, rcu = { next = 0x0, func = 0xffffffffb318ad20 } }, sess_cmd_count = { counter = 0x1 }, io_stats = {{ cmd_count = 0x0, io_byte_count = 0x0, unaligned_cmd_count = 0x0 }, { cmd_count = 0x0, io_byte_count = 0x0, unaligned_cmd_count = 0x0 }, { cmd_count = 0x0, io_byte_count = 0x0, unaligned_cmd_count = 0x0 }, { cmd_count = 0x0, io_byte_count = 0x0, unaligned_cmd_count = 0x0 }, { cmd_count = 0x0, io_byte_count = 0x0, unaligned_cmd_count = 0x0 }}, acg = 0xffff972fafcd9c00, transport_id = 0xffff973022e57f40 "\004", acg_sess_list_entry = { next = 0xffff972c646ee660, prev = 0xffff972c087a1960 }, hw_pending_work = { work = { data = { counter = 0xfffffffe0 }, entry = { next = 0xffff972c087a5078, prev = 0xffff972c087a5078 }, func = 0xffffffffc0e7e540 <scst_hw_pending_work_fn> }, timer = { entry = { next = 0x0, prev = 0x0 }, expires = 0x0, base = 0xffff97304fd93942, function = 0xffffffffb2ebbe50, data = 0xffff972c087a5070, slack = 0xffffffff, start_pid = 0xffffffff, start_site = 0x0, start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" }, wq = 0x0, cpu = 0x0 }, initiator_name = 0xffff972e3761d600 "fe80:0000:0000:0000:e41d:2d03:001f:7c80", sess_name = 0xffff972e3761d680 "fe80:0000:0000:0000:e41d:2d03:001f:7c80_4", sess_list_entry = { next = 0xffff972c646ee700, prev = 0xffff972c087a1a00 }, sysfs_sess_list_entry = { next = 0xffff972c646ee710, prev = 0xffff972c087a1a10 }, sess_init_list_entry = { next = 0x0, prev = 0x0 }, sess_shut_list_entry = { next = 0x0, prev = 0x0 }, init_deferred_cmd_list = { next = 0xffff972c087a5140, prev = 0xffff972c087a5140 }, init_deferred_mcmd_list = { next = 0xffff972c087a5150, prev = 0xffff972c087a5150 }, shut_phase = 0x1, shutdown_compl = 0x0, sess_cm_list_id_list = { next = 0xffff972c087a5170, prev = 0xffff972c087a5170 }, sess_cm_list_id_cleanup_work = { work = { data = { counter = 0xfffffffe0 }, entry = { next = 0xffff972c087a5188, prev = 0xffff972c087a5188 }, func = 0xffffffffc0e73d60 <sess_cm_list_id_cleanup_work_fn> }, timer = { entry = { next = 0x0, prev = 0x0 }, expires = 0x0, base = 0xffff97304fd93942, function = 0xffffffffb2ebbe50, data = 0xffff972c087a5180, slack = 0xffffffff, start_pid = 0xffffffff, start_site = 0x0, start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" }, wq = 0x0, cpu = 0x0 }, sess_kobj_release_cmpl = 0x0, sess_mq = 0x0, sess_kobj_ready = 0x1, sess_kobj = { name = 0xffff972e3761d5c0 "fe80:0000:0000:0000:e41d:2d03:001f:7c80_4", entry = { next = 0xffff972c087a5218, prev = 0xffff972c087a5218 }, parent = 0xffff9730ca3bf340, kset = 0x0, ktype = 0xffffffffc0ed03c0 <scst_session_ktype>, { sd = 0xffff972ee02c9d98, __UNIQUE_ID_rh_kabi_hide7 = { sd = 0xffff972ee02c9d98 }, {<No data fields>} }, kref = { refcount = { counter = 0x3 } }, state_initialized = 0x1, state_in_sysfs = 0x1, state_add_uevent_sent = 0x0, state_remove_uevent_sent = 0x0, uevent_suppress = 0x0 }, lat_kobj = 0xffff972e3761d900, reg_sess_data = 0x0, init_result_fn = 0x0, unreg_done_fn = 0xffffffffc0433cc0 <srpt_unreg_sess>, lat_stats_lock = { { rlock = { raw_lock = { val = { counter = 0x0 } } } } }, lat_stats = 0x0 } additionally, there's an error message in the dmesg that is interesting.... [3966906.400273] ------------[ cut here ]------------ [3966906.400285] WARNING: CPU: 1 PID: 30180 at /root/rpmbuild/BUILD/scst-master/srpt/src/ib_srpt.c:2249 srpt_u nregister_ch+0x56/0x90 [ib_srpt] [3966906.400289] Modules linked in: ib_srpt(OE) scst_vdisk(OE) scst(OE) rdma_ucm(OE) ib_ucm(OE) rdma_cm(OE) iw _cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) mlx5_fpga_tools(OE) mlx4_en(OE) mlx4_ib(OE) mlx4_core(OE) vfat fat skx_edac nfit libnvdimm intel_powerclamp coretemp intel_rapl iosf_mbi sgi_xvm(POE) kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd pcspkr sgi_pm(POE) lpc_ich mei_me hpilo hpwdt sg mei wmi ipmi_si ipmi_devintf ipmi_msghandler sgi_r_pool(OE) acpi_power_meter sgi_os_lib(POE) kn em(OE) ip_tables xfs libcrc32c mlx5_ib(OE) ib_uverbs(OE) raid1 ib_core(OE) sd_mod crc_t10dif crct10dif_generic mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm crct10dif_pclmul ahci c rct10dif_common crc32c_intel drm libahci mlx5_core(OE) [3966906.400379] nvme tg3 nvme_core libata mlxfw(OE) devlink mlx_compat(OE) ptp pps_core drm_panel_orientation_quirks uas usb_storage dm_mirror dm_region_hash dm_log dm_mod [3966906.400401] CPU: 1 PID: 30180 Comm: kworker/1:0 Kdump: loaded Tainted: P OE ------------ 3.10.0-1160.el7.x86_64 #1 [3966906.400404] Hardware name: HPE ProLiant DL360 Gen10/ProLiant DL360 Gen10, BIOS U32 11/13/2019 [3966906.400412] Workqueue: srpt srpt_do_compl_work [ib_srpt] [3966906.400415] Call Trace: [3966906.400432] [<ffffffffb3581340>] dump_stack+0x19/0x1b [3966906.400441] [<ffffffffb2e9b228>] __warn+0xd8/0x100 [3966906.400447] [<ffffffffb2e9b36d>] warn_slowpath_null+0x1d/0x20 [3966906.400453] [<ffffffffc04323a6>] srpt_unregister_ch+0x56/0x90 [ib_srpt] [3966906.400460] [<ffffffffc0436228>] srpt_do_compl_work+0x2c8/0x640 [ib_srpt] [3966906.400470] [<ffffffffb2ebdc4f>] process_one_work+0x17f/0x440 [3966906.400479] [<ffffffffb2ebed66>] worker_thread+0x126/0x3c0 [3966906.400486] [<ffffffffb2ebec40>] ? manage_workers.isra.26+0x2a0/0x2a0 [3966906.400492] [<ffffffffb2ec5c21>] kthread+0xd1/0xe0 [3966906.400498] [<ffffffffb2ec5b50>] ? insert_kthread_work+0x40/0x40 [3966906.400506] [<ffffffffb3593df7>] ret_from_fork_nospec_begin+0x21/0x21 [3966906.400512] [<ffffffffb2ec5b50>] ? insert_kthread_work+0x40/0x40 [3966906.400515] ---[ end trace 5d97197aa6829cf9 ]--- [3966906.400521] [30180]: scst: ***CRITICAL ERROR***: New mgmt cmd while shutting down the session ffff972c087a4d00 shut_phase 1 the last line is important. error is printed from here... 6915 6916 static int scst_post_rx_mgmt_cmd(struct scst_session *sess, 6917 struct scst_mgmt_cmd *mcmd) 6918 { 6919 unsigned long flags; 6920 int res = 0; 6921 6922 TRACE_ENTRY(); 6923 6924 if (unlikely(sess->shut_phase != SCST_SESS_SPH_READY)) { <--- 6925 PRINT_CRIT_ERROR("New mgmt cmd while shutting down the " 6926 "session %p shut_phase %ld", sess, sess->shut_phase); 6927 sBUG(); 6928 } 6929 the if() condition is checking sess->init_phase. from the structure above, that field is this... shut_phase = 0x1, <---- shutdown_compl = 0x0, sess_cm_list_id_list = { next = 0xffff972c087a5170, prev = 0xffff972c087a5170 }, 417 /* Set if session is initialized and ready */ 418 #define SCST_SESS_SPH_READY 0 419 if there's more from the crash dump i can provide, do let me know. thanks. -sam |