From: Anders W. <and...@er...> - 2016-09-29 12:29:34
|
Ack with comments. One comment is that we ought to export this new admin op in the saClm.h file (or actually a new file separate file, like we have done when extending the SAF API in other services). I think we should also select a new number (i.e. 4) for this, even though it is targeted for a different object. So add e.g. the following line: SA_CLM_ADMIN_CLUSTER_RESET = 4 In the future, we should consider how this feature interact with the enhanced cluster management feature that is on the roadmap. For now I think it is good enough, but in the future I would like to have a two-phase shutdown to avoid the possibility that some of the nodes reboot very quickly and potentially could come up before some other slow node has shut down. See further code comments inline below, marked AndersW> regards, Anders Widell On 09/28/2016 01:55 PM, Hans Nordeback wrote: > osaf/libs/common/clmsv/include/clmsv_msg.h | 6 +++ > osaf/libs/core/common/include/osaf_utility.h | 5 +++ > osaf/libs/core/common/osaf_utility.c | 22 +++++++++++++ > osaf/services/saf/clmsv/clms/clms.h | 3 +- > osaf/services/saf/clmsv/clms/clms_imm.c | 18 ++++++++++ > osaf/services/saf/clmsv/clms/clms_mds.c | 46 +++++++++++++++++++++++++++- > osaf/services/saf/clmsv/clms/clms_util.c | 12 +++++++ > osaf/services/saf/clmsv/nodeagent/main.c | 12 +++++++ > scripts/opensaf_reboot | 22 ++++++++++--- > 9 files changed, 139 insertions(+), 7 deletions(-) > > > Admin command to request cluster reboot: > immadm -o 1 safCluster=myClmCluster > > diff --git a/osaf/libs/common/clmsv/include/clmsv_msg.h b/osaf/libs/common/clmsv/include/clmsv_msg.h > --- a/osaf/libs/common/clmsv/include/clmsv_msg.h > +++ b/osaf/libs/common/clmsv/include/clmsv_msg.h > @@ -23,6 +23,7 @@ typedef enum clms_msg_type { > CLMSV_CLMS_TO_CLMA_CBK_MSG, > CLMSV_CLMS_TO_CLMA_API_RESP_MSG, > CLMSV_CLMS_TO_CLMA_IS_MEMBER_MSG, > + CLMSV_CLMS_TO_CLMNA_REBOOT_MSG, > CLMSV_MSG_MAX > } CLMSV_MSG_TYPE; > > @@ -174,6 +175,10 @@ typedef struct clmsv_is_member_info_t { > SaUint32T client_id; > }CLMSV_IS_MEMBER_INFO; > > +typedef struct clmsv_reboot_info_t { > + SaClmNodeIdT node_id; > +} CLMSV_REBOOT_INFO; > + > /* Top Level CLMSv MDS message structure for use between CLMS-> CLMA && CLMA -> CLMS */ > typedef struct clmsv_msg_t { > struct clmsv_msg_t *next; /* Mailbox processing */ > @@ -183,6 +188,7 @@ typedef struct clmsv_msg_t { > CLMSV_CBK_INFO cbk_info; /* Callback Messages from CLMS to CLA */ > CLMSV_API_RESP_INFO api_resp_info; /* Response Messages from CLMS to CLA */ > CLMSV_IS_MEMBER_INFO is_member_info; /*Is node member or not Message from CLMS to CLA*/ > + CLMSV_REBOOT_INFO reboot_info; /* Reboot request from CLMS to CLMNA */ > } info; > } CLMSV_MSG; > > diff --git a/osaf/libs/core/common/include/osaf_utility.h b/osaf/libs/core/common/include/osaf_utility.h > --- a/osaf/libs/core/common/include/osaf_utility.h > +++ b/osaf/libs/core/common/include/osaf_utility.h > @@ -24,6 +24,8 @@ > #ifndef OPENSAF_CORE_OSAF_UTILITY_H_ > #define OPENSAF_CORE_OSAF_UTILITY_H_ > > +#define USE_SAFE_REBOOT 1 AndersW> Maybe instead use: enum { kUseSafeReboot = 1 }; > + > #include <pthread.h> > > #ifdef __cplusplus > @@ -68,6 +70,9 @@ extern void osaf_abort(long i_cause) > #endif > nothrow, noreturn)); > > +extern void osaf_safe_reboot() > + __attribute__ ((nothrow)); > + AndersW> Since this is C code, a function taking no parameter should be declared "void osaf_safe_reboot(void)" > static inline void osaf_mutex_lock_ordie(pthread_mutex_t* io_mutex) { > int result = pthread_mutex_lock(io_mutex); > if (result != 0) osaf_abort(result); > diff --git a/osaf/libs/core/common/osaf_utility.c b/osaf/libs/core/common/osaf_utility.c > --- a/osaf/libs/core/common/osaf_utility.c > +++ b/osaf/libs/core/common/osaf_utility.c > @@ -16,9 +16,12 @@ > */ > > #include "osaf_utility.h" > +#include "ncssysf_def.h" > +#include "configmake.h" AndersW> Include the project's header files after the system header files (except for "osaf_utility.h" of course, which should be at the top). > #include <stdlib.h> > #include <errno.h> > #include <syslog.h> > +#include <stdio.h> > > void osaf_abort(long i_cause) > { > @@ -26,3 +29,22 @@ void osaf_abort(long i_cause) > i_cause, __builtin_return_address(0), errno); > abort(); > } > + > +void osaf_safe_reboot() > +{ > + char str[256]; > + > + snprintf(str, sizeof(str), PKGLIBDIR "/opensaf_reboot %u %s %u", 0, "not_used", USE_SAFE_REBOOT); > + syslog(LOG_NOTICE, "Reboot ordered using command: %s", str); > + > + int rc = system(str); > + if (rc < 0) { AndersW> To strictly follow the man page for system(), we should check "if (rc == -1)" here, instead of "if (rc < 0)" > + syslog(LOG_CRIT, "Node reboot failure: exit code %d", WEXITSTATUS(rc)); AndersW> In case rc == -1, we shall not call WEXITSTATUS(rc). Instead we shall log the value of errno. > + } else { > + if (WIFEXITED(rc) && WEXITSTATUS(rc) == 0) { > + syslog(LOG_NOTICE, "Command: %s successfully executed", str); > + } else { > + syslog(LOG_CRIT, "Command: %s failed with exit code %d", str, WEXITSTATUS(rc)); AndersW> Not sure if it is a big deal, but WEXITSTATUS(rc) is only valid if WEXITED(rc) != 0. If e.g. WSIGNALED(rc) != 0, we should log WTERMSIG(rc) instead, etc. > + } > + } > +} > diff --git a/osaf/services/saf/clmsv/clms/clms.h b/osaf/services/saf/clmsv/clms/clms.h > --- a/osaf/services/saf/clmsv/clms/clms.h > +++ b/osaf/services/saf/clmsv/clms/clms.h > @@ -99,6 +99,7 @@ extern uint32_t clms_mds_msg_send(CLMS_C > MDS_DEST *dest, > MDS_SYNC_SND_CTXT *mds_ctxt, MDS_SEND_PRIORITY_TYPE prio, NCSMDS_SVC_ID svc_id); > > +extern uint32_t clms_mds_msg_bcast(CLMS_CB *cb, CLMSV_MSG *bcast_msg); > extern SaAisErrorT clms_imm_activate(CLMS_CB * cb); > extern uint32_t clms_node_trackresplist_empty(CLMS_CLUSTER_NODE * op_node); > extern uint32_t clms_send_cbk_start_sub(CLMS_CB * cb, CLMS_CLUSTER_NODE * node); > @@ -125,5 +126,5 @@ extern void clms_cb_dump(void); > extern uint32_t clms_send_is_member_info(CLMS_CB * cb, SaClmNodeIdT node_id, SaBoolT member, SaBoolT is_configured); > extern void clm_imm_reinit_bg(CLMS_CB * cb); > extern void proc_downs_during_rolechange (void); > - > +extern void clms_cluster_reboot(); AndersW> This is C code, so use (void) here. > #endif /* ifndef CLMS_H */ > diff --git a/osaf/services/saf/clmsv/clms/clms_imm.c b/osaf/services/saf/clmsv/clms/clms_imm.c > --- a/osaf/services/saf/clmsv/clms/clms_imm.c > +++ b/osaf/services/saf/clmsv/clms/clms_imm.c > @@ -19,6 +19,7 @@ > > #include "clms.h" > #include "osaf_extended_name.h" > +#include "osaf_utility.h" > > extern struct ImmutilWrapperProfile immutilWrapperProfile; > > @@ -886,6 +887,23 @@ static void clms_imm_admin_op_callback(S > > TRACE_ENTER2("Admin callback for nodename:%s, opId:%llu", objectName->value, opId); > > + // E.g. immadm -o 1 safCluster=myClmCluster > + if (strncmp(osaf_extended_name_borrow(objectName), > + osaf_extended_name_borrow(&osaf_cluster->name), > + osaf_extended_name_length(objectName)) == 0) { > + if (opId == 1) { > + LOG_WA("Cluster reboot requested. Ordering cluster reboot"); > + // MDS broadcast/multi cast call is synchronous > + clms_cluster_reboot(); > + sleep(1); AndersW> Use osaf_nanosleep(&kOneSecond); > + osaf_safe_reboot(); > + } else { > + LOG_ER("Admin Operation not supported for %s", osaf_extended_name_borrow(objectName)); > + immutil_saImmOiAdminOperationResult(immOiHandle, invocation, SA_AIS_ERR_INVALID_PARAM); > + } > + goto done; > + } > + > /*Lookup by the node_name and get the cluster node for CLM Admin oper */ > nodeop = clms_node_get_by_name(objectName); > if (nodeop == NULL) { > diff --git a/osaf/services/saf/clmsv/clms/clms_mds.c b/osaf/services/saf/clmsv/clms/clms_mds.c > --- a/osaf/services/saf/clmsv/clms/clms_mds.c > +++ b/osaf/services/saf/clmsv/clms/clms_mds.c > @@ -659,7 +659,17 @@ uint32_t clms_mds_enc(struct ncsmds_call > ncs_enc_claim_space(uba, 4); > total_bytes += 4; > > - if (CLMSV_CLMS_TO_CLMA_API_RESP_MSG == msg->evt_type) { > + if (CLMSV_CLMS_TO_CLMNA_REBOOT_MSG == msg->evt_type) { > + /* encode the reboot msg **/ > + p8 = ncs_enc_reserve_space(uba, 4); > + if (!p8) { > + TRACE("ncs_enc_reserve_space failed"); > + goto err; > + } > + ncs_encode_32bit(&p8, msg->info.reboot_info.node_id); > + ncs_enc_claim_space(uba, 4); > + total_bytes += 4; > + } else if (CLMSV_CLMS_TO_CLMA_API_RESP_MSG == msg->evt_type) { > /** encode the API RSP msg subtype **/ > p8 = ncs_enc_reserve_space(uba, 4); > if (!p8) { > @@ -1517,3 +1527,37 @@ uint32_t clms_mds_msg_send(CLMS_CB * cb, > TRACE_LEAVE(); > return rc; > } > + > +/**************************************************************************** > + Name : clms_mds_msg_bcast > + > + Description : This routine sends a broadcast message to CLMNA. > + > + Arguments : cb - ptr to the CLMA CB > + bcast_msg - ptr to the CLMSv broadcast message > + > + Return Values : NCSCC_RC_SUCCESS/NCSCC_RC_FAILURE > + > + Notes : None. > +******************************************************************************/ > +uint32_t clms_mds_msg_bcast(CLMS_CB *cb, CLMSV_MSG *bcast_msg) > +{ > + NCSMDS_INFO snd_mds = {0}; > + uint32_t rc; > + > + snd_mds.i_mds_hdl = cb->mds_hdl; > + snd_mds.i_svc_id = NCSMDS_SVC_ID_CLMS; > + snd_mds.i_op = MDS_SEND; > + snd_mds.info.svc_send.i_msg = (NCSCONTEXT)bcast_msg; > + snd_mds.info.svc_send.i_to_svc = NCSMDS_SVC_ID_CLMNA; > + snd_mds.info.svc_send.i_priority = MDS_SEND_PRIORITY_HIGH; > + snd_mds.info.svc_send.i_sendtype = MDS_SENDTYPE_BCAST; > + snd_mds.info.svc_send.info.bcast.i_bcast_scope = NCSMDS_SCOPE_NONE; > + > + if ((rc = ncsmds_api(&snd_mds)) != NCSCC_RC_SUCCESS) { > + LOG_ER("%s: ncsmds_api MDS_SEND failed %u", __FUNCTION__ ,rc); > + return rc; > + } > + > + return NCSCC_RC_SUCCESS; > +} > \ No newline at end of file > diff --git a/osaf/services/saf/clmsv/clms/clms_util.c b/osaf/services/saf/clmsv/clms/clms_util.c > --- a/osaf/services/saf/clmsv/clms/clms_util.c > +++ b/osaf/services/saf/clmsv/clms/clms_util.c > @@ -1200,3 +1200,15 @@ bool ip_matched(uint16_t family1, uint8_ > return true; > } > > +// > +void clms_cluster_reboot() > +{ > + CLMSV_MSG bcast_msg; > + bcast_msg.evt_type = CLMSV_CLMS_TO_CLMNA_REBOOT_MSG; > + bcast_msg.info.reboot_info.node_id = clms_cb->node_id; > + if (clms_mds_msg_bcast(clms_cb, &bcast_msg) == NCSCC_RC_SUCCESS) { > + LOG_NO("Sending cluster reboot broadcast message succeeded"); > + } else { > + LOG_ER("Sending cluster reboot broadcast message failed"); > + } > +} > diff --git a/osaf/services/saf/clmsv/nodeagent/main.c b/osaf/services/saf/clmsv/nodeagent/main.c > --- a/osaf/services/saf/clmsv/nodeagent/main.c > +++ b/osaf/services/saf/clmsv/nodeagent/main.c > @@ -114,6 +114,18 @@ static uint32_t clmna_mds_dec(struct ncs > total_bytes += 4; > > switch (msg->evt_type) { > + case CLMSV_CLMS_TO_CLMNA_REBOOT_MSG: AndersW> There is a tab character after "case". Replace with space. > + { > + p8 = ncs_dec_flatten_space(uba, local_data, 4); > + msg->info.reboot_info.node_id = ncs_decode_32bit(&p8); > + ncs_dec_skip_space(uba, 4); > + total_bytes += 4; > + // Reboot will be performed by CLMS for this node. > + if (clmna_cb->node_info.node_id != msg->info.reboot_info.node_id) { > + osaf_safe_reboot(); > + } > + break; > + } > case CLMSV_CLMS_TO_CLMA_API_RESP_MSG: > { > p8 = ncs_dec_flatten_space(uba, local_data, 8); > diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot > --- a/scripts/opensaf_reboot > +++ b/scripts/opensaf_reboot > @@ -40,10 +40,17 @@ NODE_ID_FILE=$pkglocalstatedir/node_id > > node_id=$1 > ee_name=$2 > +safe_reboot=$3 > > # Run commands through sudo when not superuser > test $(id -u) -ne 0 && icmd=$(which sudo 2> /dev/null) > > +opensaf_safe_reboot() > +{ > + logger -t "opensaf_reboot" "Rebooting local node using shutdown" > + $icmd /sbin/shutdown -r now > +} > + > ## Use stonith for remote fencing > opensaf_reboot_with_remote_fencing() > { > @@ -91,8 +98,12 @@ temp_node_id=`cat "$NODE_ID_FILE"` > temp_node_id=`echo "$temp_node_id" |sed -e 's:^0[bBxX]::'| sed -e 's:^:0x:'` > self_node_id=`printf "%d" $temp_node_id` > > -# A node ID of zero(0) means an order to reboot the local node > -if [ "$self_node_id" = "$node_id" ] || [ $node_id = 0 ]; then > + > +if [ "$safe_reboot" = 1 ]; then > + opensaf_safe_reboot > +else > + # A node ID of zero(0) means an order to reboot the local node > + if [ "$self_node_id" = "$node_id" ] || [ $node_id = 0 ]; then > # uncomment the following line if debugging errors that keep restarting the node > # exit 0 > > @@ -114,8 +125,8 @@ if [ "$self_node_id" = "$node_id" ] || [ > > # Reboot (not shutdown) system WITH file system sync > $icmd /sbin/reboot -f > -else > - if [ "$FMS_USE_REMOTE_FENCING" = "1" ]; then > + else > + if [ "$FMS_USE_REMOTE_FENCING" = 1 ]; then > opensaf_reboot_with_remote_fencing > else > if [ ":$ee_name" != ":" ]; then > @@ -133,4 +144,5 @@ else > logger -t "opensaf_reboot" "Rebooting remote node in the absence of PLM is outside the scope of OpenSAF" > fi > fi > -fi > + fi > +fi > \ No newline at end of file |