|
From: Thien H. <thi...@en...> - 2025-11-17 12:15:33
|
During a network merge or split-brain recovery, IMM may be busy in the sync
phase and drop the set-implementer message. This causes the IMM agent to time
out and later fail with SA_AIS_ERR_EXIST, even after a restart.
If IMMND does not get a response from IMMD, or if sending the response to
the agent fails, assume an implementer issue and allow the IMM agent to set
the implementer again.
---
src/imm/immnd/ImmModel.cc | 6 +-
src/imm/immnd/immnd_cb.h | 9 +++
src/imm/immnd/immnd_db.c | 119 +++++++++++++++++++++++++++++++++++++
src/imm/immnd/immnd_evt.c | 43 ++++++++++++--
src/imm/immnd/immnd_init.h | 7 ++-
5 files changed, 176 insertions(+), 8 deletions(-)
diff --git a/src/imm/immnd/ImmModel.cc b/src/imm/immnd/ImmModel.cc
index d0add57d8..d04107e22 100644
--- a/src/imm/immnd/ImmModel.cc
+++ b/src/imm/immnd/ImmModel.cc
@@ -1777,8 +1777,9 @@ SaAisErrorT immModel_implementerSet(IMMND_CB* cb,
}
SaAisErrorT immModel_implIsFree(IMMND_CB* cb, const IMMSV_OI_IMPLSET_REQ* req,
- SaUint32T* impl_id) {
+ SaUint32T* impl_id, bool* is_owner) {
*impl_id = 0;
+ *is_owner=false;
SaImmOiImplementerNameT impName = req->impl_name.buf;
std::string implName(impName);
if (implName.empty()) {
@@ -1803,8 +1804,9 @@ SaAisErrorT immModel_implIsFree(IMMND_CB* cb, const IMMSV_OI_IMPLSET_REQ* req,
/* Check for redundant request that comes from previously timed out client */
if (impl->mConn == m_IMMSV_UNPACK_HANDLE_HIGH(req->client_hdl) &&
impl->mNodeId == m_IMMSV_UNPACK_HANDLE_LOW(req->client_hdl)) {
- *impl_id = impl->mId;
+ *is_owner = true;
}
+ *impl_id = impl->mId;
return SA_AIS_ERR_EXIST;
}
diff --git a/src/imm/immnd/immnd_cb.h b/src/imm/immnd/immnd_cb.h
index 85e3663ca..20853548d 100644
--- a/src/imm/immnd/immnd_cb.h
+++ b/src/imm/immnd/immnd_cb.h
@@ -93,6 +93,13 @@ typedef struct immnd_fevs_msg_node {
struct immnd_fevs_msg_node *next;
} IMMND_FEVS_MSG_NODE;
+typedef struct immnd_impl_reply_pending {
+ char *buf; /* heap-owned payload bytes; freed together with the node */
+ SaUint32T size; /* byte count used when comparing payloads */
+ struct immnd_impl_reply_pending *next; /* next entry; NULL at the tail */
+} IMMND_IMPL_REPLY_PENDING;
+
+
/*****************************************************************************
* Data Structure used to hold IMMND control block
*****************************************************************************/
@@ -211,6 +218,8 @@ typedef struct immnd_cb_tag {
bool splitbrain_tmr_run;
uint8_t mFevsMaxPending; /* Max pending fevs messages towards director */
bool mSyncrTimeout;
+ IMMND_IMPL_REPLY_PENDING *impl_reply_pendings;
+ IMMND_IMPL_REPLY_PENDING *last_pendings;
} IMMND_CB;
/* CB prototypes */
diff --git a/src/imm/immnd/immnd_db.c b/src/imm/immnd/immnd_db.c
index 13009054b..d7b36a919 100644
--- a/src/imm/immnd/immnd_db.c
+++ b/src/imm/immnd/immnd_db.c
@@ -388,3 +388,122 @@ unsigned int immnd_dequeue_outgoing_fevs_msg(IMMND_CB *cb,
return cb->fevs_out_count;
}
+
+/***************************************************************************
+ * Name          : immnd_implementer_pending_equal
+ *
+ * Description   : Return true when node 'n' holds exactly the 'size' bytes
+ *                 found at 'buf'. A NULL node, payload or 'buf' never match.
+ ***************************************************************************/
+static inline bool immnd_implementer_pending_equal(
+    const IMMND_IMPL_REPLY_PENDING *n, const char *buf, SaUint32T size)
+{
+	if (!n || !n->buf || !buf)
+		return false;
+	return (n->size == size) && (memcmp(n->buf, buf, size) == 0);
+}
+
+/***************************************************************************
+ * Name          : immnd_implementer_pending_exists
+ *
+ * Description   : Report whether the pending list already holds an entry
+ *                 whose payload matches the given implementer name.
+ ***************************************************************************/
+bool immnd_implementer_pending_exists(IMMND_CB *cb, IMMSV_OCTET_STRING *msg)
+{
+	const IMMND_IMPL_REPLY_PENDING *entry = cb->impl_reply_pendings;
+
+	while (entry &&
+	       !immnd_implementer_pending_equal(entry, msg->buf, msg->size))
+		entry = entry->next;
+	return entry != NULL;
+}
+
+/***************************************************************************
+ * Name          : immnd_implementer_pending_add
+ *
+ * Description   : Append a copy of the implementer name to the tail of the
+ *                 pending list. Returns false when the name is missing,
+ *                 is already listed, or memory cannot be allocated.
+ *
+ ***************************************************************************/
+bool immnd_implementer_pending_add(IMMND_CB *cb, IMMSV_OCTET_STRING *msg)
+{
+	if (!msg->buf || immnd_implementer_pending_exists(cb, msg))
+		return false;
+
+	IMMND_IMPL_REPLY_PENDING *node = calloc(1, sizeof *node);
+	if (!node)
+		return false;
+
+	/* Copy exactly msg->size bytes (the length later used by memcmp)
+	 * and keep a terminating NUL so the payload is also a C string. */
+	node->buf = calloc((size_t)msg->size + 1, 1);
+	if (!node->buf) {
+		free(node); /* do not leak the node on OOM */
+		return false;
+	}
+	memcpy(node->buf, msg->buf, msg->size);
+	node->size = msg->size;
+
+	if (cb->impl_reply_pendings)
+		cb->last_pendings->next = node;
+	else
+		cb->impl_reply_pendings = node;
+	cb->last_pendings = node;
+	return true;
+}
+
+/***************************************************************************
+ * Name          : immnd_implementer_pending_remove
+ *
+ * Description   : Unlink the first entry whose payload matches the given
+ *                 implementer name and free both node and payload.
+ *
+ ***************************************************************************/
+bool immnd_implementer_pending_remove(IMMND_CB *cb, IMMSV_OCTET_STRING *msg)
+{
+	IMMND_IMPL_REPLY_PENDING *before = NULL;
+	IMMND_IMPL_REPLY_PENDING *node;
+
+	/* Walk to the first matching entry, remembering its predecessor. */
+	for (node = cb->impl_reply_pendings; node; node = node->next) {
+		if (immnd_implementer_pending_equal(node, msg->buf,
+						    msg->size))
+			break;
+		before = node;
+	}
+	if (!node)
+		return false;
+	/* Unlink: route the predecessor (or the head) past the entry. */
+	if (before)
+		before->next = node->next;
+	else
+		cb->impl_reply_pendings = node->next;
+	/* The tail moves back when the removed entry was the last one. */
+	if (cb->last_pendings == node)
+		cb->last_pendings = before;
+	free(node->buf);
+	free(node);
+	return true;
+}
+
+/***************************************************************************
+ * Name          : immnd_implementer_pending_cleanup
+ *
+ * Description   : Free every entry (node and payload) of the pending list
+ *                 and leave both the head and the tail pointer NULL.
+ *
+ ***************************************************************************/
+void immnd_implementer_pending_cleanup(IMMND_CB *cb)
+{
+	IMMND_IMPL_REPLY_PENDING *head;
+
+	/* Pop entries off the front until nothing is left. */
+	while ((head = cb->impl_reply_pendings) != NULL) {
+		cb->impl_reply_pendings = head->next;
+		free(head->buf);
+		free(head);
+	}
+	cb->last_pendings = NULL;
+}
diff --git a/src/imm/immnd/immnd_evt.c b/src/imm/immnd/immnd_evt.c
index 46cb85b31..c1ea42220 100644
--- a/src/imm/immnd/immnd_evt.c
+++ b/src/imm/immnd/immnd_evt.c
@@ -2950,15 +2950,34 @@ static uint32_t immnd_evt_proc_impl_set(IMMND_CB *cb, IMMND_EVT *evt,
See finalizeSync #1871.
*/
SaUint32T impl_id;
+ bool is_owner = false;
send_evt.info.imma.info.implSetRsp.error =
- immModel_implIsFree(cb, &evt->info.implSet, &impl_id);
+ immModel_implIsFree(cb, &evt->info.implSet, &impl_id, &is_owner);
if (send_evt.info.imma.info.implSetRsp.error != SA_AIS_OK) {
- if (impl_id && send_evt.info.imma.info.implSetRsp.error ==
- SA_AIS_ERR_EXIST) {
+ if (is_owner && send_evt.info.imma.info.implSetRsp.error ==
+ SA_AIS_ERR_EXIST) {
/* Immediately respond OK to agent */
send_evt.info.imma.info.implSetRsp.error = SA_AIS_OK;
send_evt.info.imma.info.implSetRsp.implId = impl_id;
+ } else if (immnd_implementer_pending_exists(
+ cb, &(evt->info.implSet.impl_name))) {
+ TRACE_5("Discarding implementer id:%u", impl_id);
+ memset(&send_evt, '\0', sizeof(IMMSV_EVT));
+ send_evt.type = IMMSV_EVT_TYPE_IMMD;
+ send_evt.info.immd.type = IMMD_EVT_ND2D_DISCARD_IMPL;
+ send_evt.info.immd.info.impl_set.r.impl_id = impl_id;
+ if (immnd_mds_msg_send(cb, NCSMDS_SVC_ID_IMMD,
+ cb->immd_mdest_id, &send_evt) !=
+ NCSCC_RC_SUCCESS) {
+ LOG_ER("Discard implementer failed for "
+ "implId:%u. Client will be orphanded",
+ impl_id);
+ }
+ immModel_discardImplementer(cb, impl_id, false, NULL,
+ NULL);
+ send_evt.info.imma.info.implSetRsp.error =
+ SA_AIS_ERR_TRY_AGAIN;
}
goto agent_rsp;
}
@@ -3001,6 +3020,13 @@ static uint32_t immnd_evt_proc_impl_set(IMMND_CB *cb, IMMND_EVT *evt,
goto agent_rsp;
}
+ if (!immnd_implementer_pending_add(cb,
+ &(evt->info.implSet.impl_name))) {
+ TRACE("The implementer (%s) is already in the list of pending "
+ "replies.",
+ evt->info.implSet.impl_name.buf);
+ }
+
/*Save sinfo in continuation.
Note should set up a wait time for the continuation roughly in line
with IMMSV_WAIT_TIME.
@@ -11731,8 +11757,15 @@ static void immnd_evt_proc_impl_set_rsp(IMMND_CB *cb, IMMND_EVT *evt,
rc = immnd_mds_send_rsp(cb, &(cl_node->tmpSinfo), &send_evt);
if (rc != NCSCC_RC_SUCCESS) {
- LOG_WA(
- "Failed to send response to agent/client over MDS");
+ LOG_WA("Failed to send response to agent/client over "
+ "MDS");
+ } else {
+ if (!immnd_implementer_pending_remove(
+ cb, &(evt->info.implSet.impl_name))) {
+ LOG_WA("Failed to remove pending implementer "
+ "name: %s",
+ evt->info.implSet.impl_name.buf);
+ }
}
}
}
diff --git a/src/imm/immnd/immnd_init.h b/src/imm/immnd/immnd_init.h
index 704ea3ba9..80fb4a30d 100644
--- a/src/imm/immnd/immnd_init.h
+++ b/src/imm/immnd/immnd_init.h
@@ -472,7 +472,7 @@ void immModel_ccbAugmentAdmo(IMMND_CB *cb, SaUint32T adminOwnerId,
bool immModel_pbeNotWritable(IMMND_CB *cb);
SaAisErrorT immModel_implIsFree(IMMND_CB *cb, const IMMSV_OI_IMPLSET_REQ *req,
- SaUint32T *impl_id);
+ SaUint32T *impl_id, bool* is_owner);
SaAisErrorT immModel_resourceDisplay(
IMMND_CB *cb, const struct ImmsvAdminOperationParam *reqparams,
@@ -524,6 +524,11 @@ unsigned int immnd_enqueue_outgoing_fevs_msg(IMMND_CB *cb,
void dequeue_outgoing(IMMND_CB *cb);
+bool immnd_implementer_pending_exists(IMMND_CB *cb, IMMSV_OCTET_STRING *msg);
+bool immnd_implementer_pending_add(IMMND_CB *cb, IMMSV_OCTET_STRING *msg);
+bool immnd_implementer_pending_remove(IMMND_CB *cb, IMMSV_OCTET_STRING *msg);
+void immnd_implementer_pending_cleanup(IMMND_CB *cb);
+
/* End File : immnd_db.c */
/* File : --- immnd_mds.c */
--
2.34.1
The information in this email is confidential and may be legally privileged. It is intended solely for the addressee. Any opinions expressed are mine and do not necessarily represent the opinions of the Company. Emails are susceptible to interference. If you are not the intended recipient, any disclosure, copying, distribution or any action taken or omitted to be taken in reliance on it, is strictly prohibited and may be unlawful. If you have received this message in error, do not open any attachments but please notify the Endava Service Desk on (+44 (0)870 423 0187), and delete this message from your system. The sender accepts no responsibility for information, errors or omissions in this email, or for its use or misuse, or for any act committed or omitted in connection with this communication. If in doubt, please verify the authenticity of the contents with the sender. Please rely on your own virus checkers as no responsibility is taken by the sender for any damage rising out of any bug or virus infection.
Endava plc is a company registered in England under company number 5722669 whose registered office is at 125 Old Broad Street, London, EC2N 1AR, United Kingdom. Endava plc is the Endava group holding company and does not provide any services to clients. Each of Endava plc and its subsidiaries is a separate legal entity and has no liability for another such entity's acts or omissions.
|