From: Tuong L. <tuo...@de...> - 2019-07-12 05:16:04
In conjunction with changing the interfaces' MTU (e.g. especially in the case of bonding), where the TIPC links are brought up and down within a short time, a couple of issues were detected with the current link changeover mechanism:

1) When a link comes up but is immediately forced down again, the failover procedure is carried out in order to fail over all the messages in the link's transmq queue onto the other working link. The link and node state is also set to FAILINGOVER as part of the process. Each message is transmitted in the form of a FAILOVER_MSG, so its size grows by 40 bytes (the message header size). There is no problem as long as the original message size is not larger than the link's MTU - 40, and indeed this is the maximum size of a normal payload message. However, in the situation above, because the link has only just come up, the messages in the link's transmq are mostly SYNCH_MSGs generated by the link synching procedure, so their size may already have reached that maximum. When a FAILOVER_MSG is built on top of such a SYNCH_MSG, its size exceeds the link's MTU. As a result, the messages are silently dropped, the failover procedure never completes, and the link cannot exit the FAILINGOVER state, so it cannot be re-established.

2) The same scenario can occur even more easily when the MTUs of the links are set differently or are being changed. In that case, as soon as a large message in the failed link's transmq was built and fragmented with that link's MTU larger than the other link's, the issue occurs (no prior link synching is needed).

3) The link synching procedure faces the same issue, but since link synching is only started upon receipt of a SYNCH_MSG, dropping the message does not lead to a state deadlock; it is still not the expected behaviour by design.

Issues 1) and 3) are resolved by the previous commit, which generates only a dummy SYNCH_MSG (i.e. without data) at link synching, so the size of any resulting FAILOVER_MSG will never exceed the link's MTU.

For issue 2), the only solution is to fragment the messages in the failed link's transmq according to the working link's MTU so that they can then be failed over. A new function is added to accomplish this; the result is still a TUNNEL_PROTOCOL/FAILOVER_MSG, but if the original message is too large it is fragmented and reassembled at the receiving side.
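A minimal standalone sketch of the size arithmetic described above, assuming a hypothetical 1500-byte link MTU and the 40-byte internal header (INT_H_SIZE) mentioned in the text; the fragment count follows the dsz / dmax + 1 formula used by the new tipc_msg_fragment() below:

```c
#include <stdio.h>

#define INT_H_SIZE 40	/* internal message header size, per the text above */

int main(void)
{
	int link_mtu = 1500;			/* hypothetical link MTU */
	int synch_msg_size = link_mtu;		/* SYNCH_MSG built at full MTU */
	int failover_msg_size = synch_msg_size + INT_H_SIZE;

	/* Problem: the tunnelled message no longer fits the link MTU */
	if (failover_msg_size > link_mtu)
		printf("FAILOVER_MSG of %d bytes exceeds MTU %d: dropped\n",
		       failover_msg_size, link_mtu);

	/* Fix for issue 2): split into tunnel fragments whose payload is
	 * at most (working link MTU - INT_H_SIZE) each
	 */
	int dmax = link_mtu - INT_H_SIZE;
	int nof_fragms = synch_msg_size / dmax + 1;

	printf("sent instead as %d fragments of up to %d payload bytes\n",
	       nof_fragms, dmax);
	return 0;
}
```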
Acked-by: Ying Xue <yin...@wi...> Acked-by: Jon Maloy <jon...@er...> Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/link.c | 93 ++++++++++++++++++++++++++++++++++++++++++++++++--------- net/tipc/msg.c | 59 ++++++++++++++++++++++++++++++++++++ net/tipc/msg.h | 18 ++++++++++- 3 files changed, 155 insertions(+), 15 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index e215b4ba6a4b..2c274777b2dd 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -180,6 +180,7 @@ struct tipc_link { /* Fragmentation/reassembly */ struct sk_buff *reasm_buf; + struct sk_buff *reasm_tnlmsg; /* Broadcast */ u16 ackers; @@ -897,8 +898,10 @@ void tipc_link_reset(struct tipc_link *l) l->backlog[TIPC_CRITICAL_IMPORTANCE].len = 0; l->backlog[TIPC_SYSTEM_IMPORTANCE].len = 0; kfree_skb(l->reasm_buf); + kfree_skb(l->reasm_tnlmsg); kfree_skb(l->failover_reasm_skb); l->reasm_buf = NULL; + l->reasm_tnlmsg = NULL; l->failover_reasm_skb = NULL; l->rcv_unacked = 0; l->snd_nxt = 1; @@ -940,6 +943,9 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, int rc = 0; if (unlikely(msg_size(hdr) > mtu)) { + pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n", + skb_queue_len(list), msg_user(hdr), + msg_type(hdr), msg_size(hdr), mtu); skb_queue_purge(list); return -EMSGSIZE; } @@ -1233,6 +1239,7 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *inputq) { struct sk_buff **reasm_skb = &l->failover_reasm_skb; + struct sk_buff **reasm_tnlmsg = &l->reasm_tnlmsg; struct sk_buff_head *fdefq = &l->failover_deferdq; struct tipc_msg *hdr = buf_msg(skb); struct sk_buff *iskb; @@ -1240,40 +1247,56 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, int rc = 0; u16 seqno; - /* SYNCH_MSG */ - if (msg_type(hdr) == SYNCH_MSG) - goto drop; + if (msg_type(hdr) == SYNCH_MSG) { + kfree_skb(skb); + return 0; + } - /* FAILOVER_MSG */ - if (!tipc_msg_extract(skb, &iskb, &ipos)) { - pr_warn_ratelimited("Cannot extract FAILOVER_MSG, defq: %d\n", - skb_queue_len(fdefq)); - return rc; + /* Not a fragment? */ + if (likely(!msg_nof_fragms(hdr))) { + if (unlikely(!tipc_msg_extract(skb, &iskb, &ipos))) { + pr_warn_ratelimited("Unable to extract msg, defq: %d\n", + skb_queue_len(fdefq)); + return 0; + } + kfree_skb(skb); + } else { + /* Set fragment type for buf_append */ + if (msg_fragm_no(hdr) == 1) + msg_set_type(hdr, FIRST_FRAGMENT); + else if (msg_fragm_no(hdr) < msg_nof_fragms(hdr)) + msg_set_type(hdr, FRAGMENT); + else + msg_set_type(hdr, LAST_FRAGMENT); + + if (!tipc_buf_append(reasm_tnlmsg, &skb)) { + /* Successful but non-complete reassembly? 
*/ + if (*reasm_tnlmsg || link_is_bc_rcvlink(l)) + return 0; + pr_warn_ratelimited("Unable to reassemble tunnel msg\n"); + return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + } + iskb = skb; } do { seqno = buf_seqno(iskb); - if (unlikely(less(seqno, l->drop_point))) { kfree_skb(iskb); continue; } - if (unlikely(seqno != l->drop_point)) { __tipc_skb_queue_sorted(fdefq, seqno, iskb); continue; } l->drop_point++; - if (!tipc_data_input(l, iskb, inputq)) rc |= tipc_link_input(l, iskb, inputq, reasm_skb); if (unlikely(rc)) break; } while ((iskb = __tipc_skb_dequeue(fdefq, l->drop_point))); -drop: - kfree_skb(skb); return rc; } @@ -1663,15 +1686,18 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, struct sk_buff *skb, *tnlskb; struct tipc_msg *hdr, tnlhdr; struct sk_buff_head *queue = &l->transmq; - struct sk_buff_head tmpxq, tnlq; + struct sk_buff_head tmpxq, tnlq, frags; u16 pktlen, pktcnt, seqno = l->snd_nxt; + bool pktcnt_need_update = false; u16 syncpt; + int rc; if (!tnl) return; skb_queue_head_init(&tnlq); skb_queue_head_init(&tmpxq); + skb_queue_head_init(&frags); /* At least one packet required for safe algorithm => add dummy */ skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, @@ -1727,6 +1753,39 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, if (queue == &l->backlogq) msg_set_seqno(hdr, seqno++); pktlen = msg_size(hdr); + + /* Tunnel link MTU is not large enough? This could be + * due to: + * 1) Link MTU has just changed or set differently; + * 2) Or FAILOVER on the top of a SYNCH message + * + * The 2nd case should not happen if peer supports + * TIPC_TUNNEL_ENHANCED + */ + if (pktlen > tnl->mtu - INT_H_SIZE) { + if (mtyp == FAILOVER_MSG && + (tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) { + rc = tipc_msg_fragment(skb, &tnlhdr, tnl->mtu, + &frags); + if (rc) { + pr_warn("%sunable to frag msg: rc %d\n", + link_co_err, rc); + return; + } + pktcnt += skb_queue_len(&frags) - 1; + pktcnt_need_update = true; + skb_queue_splice_tail_init(&frags, &tnlq); + continue; + } + /* Unluckily, peer doesn't have TIPC_TUNNEL_ENHANCED + * => Just warn it and return! + */ + pr_warn_ratelimited("%stoo large msg <%d, %d>: %d!\n", + link_co_err, msg_user(hdr), + msg_type(hdr), msg_size(hdr)); + return; + } + msg_set_size(&tnlhdr, pktlen + INT_H_SIZE); tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC); if (!tnlskb) { @@ -1742,6 +1801,12 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, goto tnl; } + if (pktcnt_need_update) + skb_queue_walk(&tnlq, skb) { + hdr = buf_msg(skb); + msg_set_msgcnt(hdr, pktcnt); + } + tipc_link_xmit(tnl, &tnlq, xmitq); if (mtyp == FAILOVER_MSG) { diff --git a/net/tipc/msg.c b/net/tipc/msg.c index f48e5857210f..e6d49cdc61b4 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -244,6 +244,65 @@ bool tipc_msg_validate(struct sk_buff **_skb) } /** + * tipc_msg_fragment - build a fragment skb list for TIPC message + * + * @skb: TIPC message skb + * @hdr: internal msg header to be put on the top of the fragments + * @pktmax: max size of a fragment incl. the header + * @frags: returned fragment skb list + * + * Returns 0 if the fragmentation is successful, otherwise: -EINVAL + * or -ENOMEM + */ +int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, + int pktmax, struct sk_buff_head *frags) +{ + int pktno, nof_fragms, dsz, dmax, eat; + struct tipc_msg *_hdr; + struct sk_buff *_skb; + u8 *data; + + /* Non-linear buffer? 
*/ + if (skb_linearize(skb)) + return -ENOMEM; + + data = (u8 *)skb->data; + dsz = msg_size(buf_msg(skb)); + dmax = pktmax - INT_H_SIZE; + if (dsz <= dmax || !dmax) + return -EINVAL; + + nof_fragms = dsz / dmax + 1; + for (pktno = 1; pktno <= nof_fragms; pktno++) { + if (pktno < nof_fragms) + eat = dmax; + else + eat = dsz % dmax; + /* Allocate a new fragment */ + _skb = tipc_buf_acquire(INT_H_SIZE + eat, GFP_ATOMIC); + if (!_skb) + goto error; + skb_orphan(_skb); + __skb_queue_tail(frags, _skb); + /* Copy header & data to the fragment */ + skb_copy_to_linear_data(_skb, hdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(_skb, INT_H_SIZE, data, eat); + data += eat; + /* Update the fragment's header */ + _hdr = buf_msg(_skb); + msg_set_fragm_no(_hdr, pktno); + msg_set_nof_fragms(_hdr, nof_fragms); + msg_set_size(_hdr, INT_H_SIZE + eat); + } + return 0; + +error: + __skb_queue_purge(frags); + __skb_queue_head_init(frags); + return -ENOMEM; +} + +/** * tipc_msg_build - create buffer chain containing specified header and data * @mhdr: Message header, to be prepended to data * @m: User message diff --git a/net/tipc/msg.h b/net/tipc/msg.h index fca042cdff88..1c8c8dd32a4e 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -721,12 +721,26 @@ static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) msg_set_bits(m, 4, 16, 0xffff, n); } +static inline u32 msg_nof_fragms(struct tipc_msg *m) +{ + return msg_bits(m, 4, 0, 0xffff); +} + +static inline void msg_set_nof_fragms(struct tipc_msg *m, u32 n) +{ + msg_set_bits(m, 4, 0, 0xffff, n); +} + +static inline u32 msg_fragm_no(struct tipc_msg *m) +{ + return msg_bits(m, 4, 16, 0xffff); +} + static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } - static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); @@ -1045,6 +1059,8 @@ bool tipc_msg_bundle(struct sk_buff *skb, struct tipc_msg *msg, u32 mtu); bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, u32 mtu, u32 dnode); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); +int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, + int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); -- 2.13.7 |
From: Tuong L. <tuo...@de...> - 2019-07-12 05:16:03
This patch series resolves some issues found with the current link changeover mechanism; it also includes an optimization of the link synching mechanism.

Tuong Lien (2):
  tipc: optimize link synching mechanism
  tipc: fix changeover issues due to large packet

 net/tipc/link.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++-------
 net/tipc/msg.c  |  59 ++++++++++++++++++++++++++++
 net/tipc/msg.h  |  28 ++++++++++++-
 net/tipc/node.c |   6 ++-
 net/tipc/node.h |   6 ++-
 5 files changed, 199 insertions(+), 19 deletions(-)

--
2.13.7
From: Tuong L. <tuo...@de...> - 2019-07-12 05:16:03
This commit along with the next one are to resolve the issues with the link changeover mechanism. See that commit for details. Basically, for the link synching, from now on, we will send only one single ("dummy") SYNCH message to peer. The SYNCH message does not contain any data, just a header conveying the synch point to the peer. A new node capability flag ("TIPC_TUNNEL_ENHANCED") is introduced for backward compatible! Acked-by: Ying Xue <yin...@wi...> Acked-by: Jon Maloy <jon...@er...> Suggested-by: Jon Maloy <jon...@er...> Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/link.c | 26 ++++++++++++++++++++++++++ net/tipc/msg.h | 10 ++++++++++ net/tipc/node.c | 6 ++++-- net/tipc/node.h | 6 ++++-- 4 files changed, 44 insertions(+), 4 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index 66d3a07bc571..e215b4ba6a4b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1665,6 +1665,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, struct sk_buff_head *queue = &l->transmq; struct sk_buff_head tmpxq, tnlq; u16 pktlen, pktcnt, seqno = l->snd_nxt; + u16 syncpt; if (!tnl) return; @@ -1684,6 +1685,31 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, tipc_link_xmit(l, &tnlq, &tmpxq); __skb_queue_purge(&tmpxq); + /* Link Synching: + * From now on, send only one single ("dummy") SYNCH message + * to peer. The SYNCH message does not contain any data, just + * a header conveying the synch point to the peer. + */ + if (mtyp == SYNCH_MSG && (tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) { + tnlskb = tipc_msg_create(TUNNEL_PROTOCOL, SYNCH_MSG, + INT_H_SIZE, 0, l->addr, + tipc_own_addr(l->net), + 0, 0, 0); + if (!tnlskb) { + pr_warn("%sunable to create dummy SYNCH_MSG\n", + link_co_err); + return; + } + + hdr = buf_msg(tnlskb); + syncpt = l->snd_nxt + skb_queue_len(&l->backlogq) - 1; + msg_set_syncpt(hdr, syncpt); + msg_set_bearer_id(hdr, l->peer_bearer_id); + __skb_queue_tail(&tnlq, tnlskb); + tipc_link_xmit(tnl, &tnlq, xmitq); + return; + } + /* Initialize reusable tunnel packet header */ tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL, mtyp, INT_H_SIZE, l->addr); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index da509f0eb9ca..fca042cdff88 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -877,6 +877,16 @@ static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) msg_set_bits(m, 9, 16, 0xffff, n); } +static inline u16 msg_syncpt(struct tipc_msg *m) +{ + return msg_bits(m, 9, 16, 0xffff); +} + +static inline void msg_set_syncpt(struct tipc_msg *m, u16 n) +{ + msg_set_bits(m, 9, 16, 0xffff, n); +} + static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); diff --git a/net/tipc/node.c b/net/tipc/node.c index 324a1f91b394..5d8b48051bb9 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1649,7 +1649,6 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); - u16 iseqno = msg_seqno(msg_inner_hdr(hdr)); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; @@ -1748,7 +1747,10 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, /* Initiate synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { - syncpt = iseqno + exp_pkts - 1; + if (n->capabilities & TIPC_TUNNEL_ENHANCED) + syncpt = msg_syncpt(hdr); + else + syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1; if (!tipc_link_is_up(l)) 
__tipc_node_link_up(n, bearer_id, xmitq); if (n->state == SELF_UP_PEER_UP) { diff --git a/net/tipc/node.h b/net/tipc/node.h index c0bf49ea3de4..291d0ecd4101 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -53,7 +53,8 @@ enum { TIPC_NODE_ID128 = (1 << 5), TIPC_LINK_PROTO_SEQNO = (1 << 6), TIPC_MCAST_RBCTL = (1 << 7), - TIPC_GAP_ACK_BLOCK = (1 << 8) + TIPC_GAP_ACK_BLOCK = (1 << 8), + TIPC_TUNNEL_ENHANCED = (1 << 9) }; #define TIPC_NODE_CAPABILITIES (TIPC_SYN_BIT | \ @@ -64,7 +65,8 @@ enum { TIPC_NODE_ID128 | \ TIPC_LINK_PROTO_SEQNO | \ TIPC_MCAST_RBCTL | \ - TIPC_GAP_ACK_BLOCK) + TIPC_GAP_ACK_BLOCK | \ + TIPC_TUNNEL_ENHANCED) #define INVALID_BEARER_ID -1 void tipc_node_stop(struct net *net); -- 2.13.7 |
From: Jon M. <jon...@er...> - 2019-07-11 12:55:23
> -----Original Message----- > From: Chris Packham <Chr...@al...> > Sent: 10-Jul-19 16:58 > To: Jon Maloy <jon...@er...>; Eric Dumazet > <eri...@gm...>; yin...@wi...; > da...@da... > Cc: ne...@vg...; tip...@li...; linux- > ke...@vg... > Subject: Re: [PATCH] tipc: ensure skb->lock is initialised > > > On 11/07/19 1:10 AM, Jon Maloy wrote: > >> -----Original Message----- > >> From: Eric Dumazet <eri...@gm...> > >> Sent: 10-Jul-19 04:00 > >> To: Jon Maloy <jon...@er...>; Eric Dumazet > >> <eri...@gm...>; Chris Packham > >> <Chr...@al...>; yin...@wi...; > >> da...@da... > >> Cc: ne...@vg...; tip...@li...; > >> linux- ke...@vg... > >> Subject: Re: [PATCH] tipc: ensure skb->lock is initialised > >> > >> > >> > >> On 7/9/19 10:15 PM, Jon Maloy wrote: > >>> > >>> It is not only for lockdep purposes, -it is essential. But please > >>> provide details > >> about where you see that more fixes are needed. > >>> > >> > >> Simple fact that you detect a problem only when skb_queue_purge() is > >> called should talk by itself. > >> > >> As I stated, there are many places where the list is manipulated > >> _without_ its spinlock being held. > > > > Yes, and that is the way it should be on the send path. > > > >> > >> You want consistency, then > >> > >> - grab the spinlock all the time. > >> - Or do not ever use it. > > > > That is exactly what we are doing. > > - The send path doesn't need the spinlock, and never grabs it. > > - The receive path does need it, and always grabs it. > > > > However, since we don't know from the beginning which path a created > > message will follow, we initialize the queue spinlock "just in case" > > when it is created, even though it may never be used later. > > You can see this as a violation of the principle you are stating > > above, but it is a prize that is worth paying, given savings in code > > volume, complexity and performance. > > > >> > >> Do not initialize the spinlock just in case a path will use > >> skb_queue_purge() (instead of using __skb_queue_purge()) > > > > I am ok with that. I think we can agree that Chris goes for that > > solution, so we can get this bug fixed. > > So would you like a v2 with an improved commit message? I note that I said > skb->lock instead of head->lock in the subject line so at least that should be > corrected. Yes, unless Eric has any more objections. I addition, I have realized that we can change from skb_queue_head_init() to __skb_queue_head_init() at all the disputed locations in the socket code. Then, we do a separate call to spin_lock_init(&list->lock) at the moment we find that the message will follow the receive path, i.e., in tipc_node_xmit(). That should make everybody happy. I will post a patch when net-next re-opens. BR ///jon |
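A minimal sketch of the approach described in the last paragraph, i.e. __skb_queue_head_init() on the send path plus a deferred spin_lock_init() once the buffers are known to take the receive path. All example_* names and the context structure are hypothetical; only the skb/spinlock helpers are real kernel APIs:

```c
#include <linux/skbuff.h>
#include <linux/spinlock.h>

struct example_ctx {			/* hypothetical stand-in for tipc context */
	bool dest_is_local;
};

/* Receive path: dequeues under list->lock, so the lock must be initialised. */
static void example_sk_rcv(struct sk_buff_head *list)
{
	struct sk_buff *skb;

	while ((skb = skb_dequeue(list)) != NULL)	/* takes list->lock */
		kfree_skb(skb);
}

/* Send path: never touches list->lock. */
static void example_link_xmit(struct sk_buff_head *list)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue(list)) != NULL)	/* lockless */
		kfree_skb(skb);
}

static void example_node_xmit(struct example_ctx *ctx, struct sk_buff_head *list)
{
	if (ctx->dest_is_local) {
		/* The buffers will follow the receive path, which uses the
		 * queue spinlock, so initialise it only now.
		 */
		spin_lock_init(&list->lock);
		example_sk_rcv(list);
	} else {
		example_link_xmit(list);
	}
}

static void example_send(struct example_ctx *ctx, struct sk_buff *skb)
{
	struct sk_buff_head xmitq;

	__skb_queue_head_init(&xmitq);	/* list pointers only, no lock init */
	__skb_queue_tail(&xmitq, skb);
	example_node_xmit(ctx, &xmitq);
}
```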
From: Jon M. <jon...@er...> - 2019-07-10 13:42:35
> -----Original Message----- > From: Eric Dumazet <eri...@gm...> > Sent: 10-Jul-19 04:00 > To: Jon Maloy <jon...@er...>; Eric Dumazet > <eri...@gm...>; Chris Packham > <Chr...@al...>; yin...@wi...; > da...@da... > Cc: ne...@vg...; tip...@li...; linux- > ke...@vg... > Subject: Re: [PATCH] tipc: ensure skb->lock is initialised > > > > On 7/9/19 10:15 PM, Jon Maloy wrote: > > > > It is not only for lockdep purposes, -it is essential. But please provide details > about where you see that more fixes are needed. > > > > Simple fact that you detect a problem only when skb_queue_purge() is called > should talk by itself. > > As I stated, there are many places where the list is manipulated _without_ its > spinlock being held. Yes, and that is the way it should be on the send path. > > You want consistency, then > > - grab the spinlock all the time. > - Or do not ever use it. That is exactly what we are doing. - The send path doesn't need the spinlock, and never grabs it. - The receive path does need it, and always grabs it. However, since we don't know from the beginning which path a created message will follow, we initialize the queue spinlock "just in case" when it is created, even though it may never be used later. You can see this as a violation of the principle you are stating above, but it is a prize that is worth paying, given savings in code volume, complexity and performance. > > Do not initialize the spinlock just in case a path will use skb_queue_purge() > (instead of using __skb_queue_purge()) I am ok with that. I think we can agree that Chris goes for that solution, so we can get this bug fixed. ///jon |
From: Tung N. <tun...@de...> - 2019-07-10 02:39:45
Commit 365ad35 ("tipc: reduce risk of user starvation during link congestion") aimed to allow senders to add their lists of socket buffers to the link backlog queues under link congestion. However, when dequeuing the link wakeup queue by link_prepare_wakeup(), there might be a worst case scenario: - More than 10 wakeup messages near the wakeup queue head are not dequeued because the backlog queue length is still greater than the limit. - Many wakeup messages near the wakeup queue tail are not dequeued though their importance is different from the one of 10 wakeup messages near the queue head and the backlog queue length is less than the limit. Furthermore, function tipc_sk_rcv() consumes both normal data messages and wakeup messages from the link input queue. By allowing oversubscriptions, the number of messages passed through tipc_sk_rcv() would be higher. Applying destination port filter to wakeup messages via tipc_skb_peek_port() is not necessary and even causes more overhead. As a result, some non-blocking socket senders are not able to send data because epoll() takes many seconds to return EPOLLOUT. This commit fixes this issues by: - Allowing to dequeue as many wakeup messages as possible. - Separating wakeup messages from the link input queue. All wakeup messages are now put in a local queue and consumed in a simple way. Fixes: 365ad35 ("tipc: reduce risk of user starvation during link congestion") Signed-off-by: Tung Nguyen <tun...@de...> --- net/tipc/bcast.c | 42 +++++++++++----------- net/tipc/link.c | 67 +++++++++++++++++----------------- net/tipc/link.h | 12 ++++--- net/tipc/node.c | 28 +++++++++++---- net/tipc/socket.c | 91 +++++++++++++++++++++++++++++++++++++++++------ net/tipc/socket.h | 1 + 6 files changed, 166 insertions(+), 75 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 6c997d4a6218..4de0f9780ef5 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -421,11 +421,11 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb) { struct tipc_msg *hdr = buf_msg(skb); - struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; - struct sk_buff_head xmitq; + struct sk_buff_head xmitq, wakeupq; int rc; __skb_queue_head_init(&xmitq); + __skb_queue_head_init(&wakeupq); if (msg_mc_netid(hdr) != tipc_netid(net) || !tipc_link_is_up(l)) { kfree_skb(skb); @@ -434,16 +434,16 @@ int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb) tipc_bcast_lock(net); if (msg_user(hdr) == BCAST_PROTOCOL) - rc = tipc_link_bc_nack_rcv(l, skb, &xmitq); + rc = tipc_link_bc_nack_rcv(l, skb, &xmitq, &wakeupq); else - rc = tipc_link_rcv(l, skb, NULL); + rc = tipc_link_rcv(l, skb, NULL, &wakeupq); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? 
*/ - if (!skb_queue_empty(inputq)) - tipc_sk_rcv(net, inputq); + if (!skb_queue_empty(&wakeupq)) + tipc_sk_rcv_wakeup(net, &wakeupq); return rc; } @@ -455,25 +455,25 @@ int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb) void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr) { - struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; u16 acked = msg_bcast_ack(hdr); - struct sk_buff_head xmitq; + struct sk_buff_head xmitq, wakeupq; /* Ignore bc acks sent by peer before bcast synch point was received */ if (msg_bc_ack_invalid(hdr)) return; __skb_queue_head_init(&xmitq); + __skb_queue_head_init(&wakeupq); tipc_bcast_lock(net); - tipc_link_bc_ack_rcv(l, acked, &xmitq); + tipc_link_bc_ack_rcv(l, acked, &xmitq, &wakeupq); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? */ - if (!skb_queue_empty(inputq)) - tipc_sk_rcv(net, inputq); + if (!skb_queue_empty(&wakeupq)) + tipc_sk_rcv_wakeup(net, &wakeupq); } /* tipc_bcast_synch_rcv - check and update rcv link with peer's send state @@ -483,17 +483,17 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr) { - struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; - struct sk_buff_head xmitq; + struct sk_buff_head xmitq, wakeupq; int rc = 0; __skb_queue_head_init(&xmitq); + __skb_queue_head_init(&wakeupq); tipc_bcast_lock(net); if (msg_type(hdr) != STATE_MSG) { tipc_link_bc_init_rcv(l, hdr); } else if (!msg_bc_ack_invalid(hdr)) { - tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq); + tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq, &wakeupq); rc = tipc_link_bc_sync_rcv(l, hdr, &xmitq); } tipc_bcast_unlock(net); @@ -501,8 +501,8 @@ int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? */ - if (!skb_queue_empty(inputq)) - tipc_sk_rcv(net, inputq); + if (!skb_queue_empty(&wakeupq)) + tipc_sk_rcv_wakeup(net, &wakeupq); return rc; } @@ -529,13 +529,13 @@ void tipc_bcast_add_peer(struct net *net, struct tipc_link *uc_l, void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) { struct tipc_link *snd_l = tipc_bc_sndlink(net); - struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; - struct sk_buff_head xmitq; + struct sk_buff_head xmitq, wakeupq; __skb_queue_head_init(&xmitq); + __skb_queue_head_init(&wakeupq); tipc_bcast_lock(net); - tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq); + tipc_link_remove_bc_peer(snd_l, rcv_l, &xmitq, &wakeupq); tipc_bcbase_select_primary(net); tipc_bcbase_calc_bc_threshold(net); tipc_bcast_unlock(net); @@ -543,8 +543,8 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) tipc_bcbase_xmit(net, &xmitq); /* Any socket wakeup messages ? 
*/ - if (!skb_queue_empty(inputq)) - tipc_sk_rcv(net, inputq); + if (!skb_queue_empty(&wakeupq)) + tipc_sk_rcv_wakeup(net, &wakeupq); } int tipc_bclink_reset_stats(struct net *net) diff --git a/net/tipc/link.c b/net/tipc/link.c index 2050fd386642..e67d7e6a793b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -237,7 +237,8 @@ static int link_is_up(struct tipc_link *l) } static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, - struct sk_buff_head *xmitq); + struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq); static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, bool probe_reply, u16 rcvgap, int tolerance, int priority, @@ -331,6 +332,11 @@ struct sk_buff_head *tipc_link_inputq(struct tipc_link *l) return l->inputq; } +struct sk_buff_head *tipc_link_wakeupq(struct tipc_link *l) +{ + return &l->wakeupq; +} + char tipc_link_plane(struct tipc_link *l) { return l->net_plane; @@ -355,19 +361,21 @@ void tipc_link_add_bc_peer(struct tipc_link *snd_l, void tipc_link_remove_bc_peer(struct tipc_link *snd_l, struct tipc_link *rcv_l, - struct sk_buff_head *xmitq) + struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq) { u16 ack = snd_l->snd_nxt - 1; snd_l->ackers--; rcv_l->bc_peer_is_up = true; rcv_l->state = LINK_ESTABLISHED; - tipc_link_bc_ack_rcv(rcv_l, ack, xmitq); + tipc_link_bc_ack_rcv(rcv_l, ack, xmitq, wakeupq); trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!"); tipc_link_reset(rcv_l); rcv_l->state = LINK_RESET; if (!snd_l->ackers) { trace_tipc_link_reset(snd_l, TIPC_DUMP_ALL, "zero ackers!"); + skb_queue_splice_tail_init(&snd_l->wakeupq, wakeupq); tipc_link_reset(snd_l); snd_l->state = LINK_RESET; __skb_queue_purge(xmitq); @@ -500,7 +508,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, __skb_queue_head_init(&l->backlogq); __skb_queue_head_init(&l->deferdq); __skb_queue_head_init(&l->failover_deferdq); - skb_queue_head_init(&l->wakeupq); + __skb_queue_head_init(&l->wakeupq); skb_queue_head_init(l->inputq); return true; } @@ -839,9 +847,8 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) dnode, l->addr, dport, 0, 0); if (!skb) return -ENOBUFS; - msg_set_dest_droppable(buf_msg(skb), true); TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr); - skb_queue_tail(&l->wakeupq, skb); + __skb_queue_tail(&l->wakeupq, skb); l->stats.link_congs++; trace_tipc_link_conges(l, TIPC_DUMP_ALL, "wakeup scheduled!"); return -ELINKCONG; @@ -853,46 +860,34 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) * Wake up a number of waiting users, as permitted by available space * in the send queue */ -static void link_prepare_wakeup(struct tipc_link *l) +static void link_prepare_wakeup(struct tipc_link *l, + struct sk_buff_head *wakeupq) { struct sk_buff *skb, *tmp; - int imp, i = 0; + int imp; skb_queue_walk_safe(&l->wakeupq, skb, tmp) { imp = TIPC_SKB_CB(skb)->chain_imp; if (l->backlog[imp].len < l->backlog[imp].limit) { - skb_unlink(skb, &l->wakeupq); - skb_queue_tail(l->inputq, skb); - } else if (i++ > 10) { - break; + __skb_unlink(skb, &l->wakeupq); + __skb_queue_tail(wakeupq, skb); } } } void tipc_link_reset(struct tipc_link *l) { - struct sk_buff_head list; - - __skb_queue_head_init(&list); - l->in_session = false; /* Force re-synch of peer session number before establishing */ l->peer_session--; l->session++; l->mtu = l->advertised_mtu; - spin_lock_bh(&l->wakeupq.lock); - skb_queue_splice_init(&l->wakeupq, &list); - spin_unlock_bh(&l->wakeupq.lock); - - 
spin_lock_bh(&l->inputq->lock); - skb_queue_splice_init(&list, l->inputq); - spin_unlock_bh(&l->inputq->lock); - __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); __skb_queue_purge(&l->backlogq); __skb_queue_purge(&l->failover_deferdq); + __skb_queue_purge(&l->wakeupq); l->backlog[TIPC_LOW_IMPORTANCE].len = 0; l->backlog[TIPC_MEDIUM_IMPORTANCE].len = 0; l->backlog[TIPC_HIGH_IMPORTANCE].len = 0; @@ -1445,9 +1440,11 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, * @l: the link that should handle the message * @skb: TIPC packet * @xmitq: queue to place packets to be sent after this call + * @wakeupq: list of wakeup messages to be consumed after this call */ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, - struct sk_buff_head *xmitq) + struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq) { struct sk_buff_head *defq = &l->deferdq; struct tipc_msg *hdr = buf_msg(skb); @@ -1456,7 +1453,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, /* Verify and update link state */ if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) - return tipc_link_proto_rcv(l, skb, xmitq); + return tipc_link_proto_rcv(l, skb, xmitq, wakeupq); /* Don't send probe at next timeout expiration */ l->silent_intv_cnt = 0; @@ -1484,7 +1481,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, l->stale_cnt = 0; tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) - link_prepare_wakeup(l); + link_prepare_wakeup(l, wakeupq); } /* Defer delivery if sequence gap */ @@ -1518,6 +1515,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, bool probe_reply, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq) + { struct tipc_link *bcl = l->bc_rcvlink; struct sk_buff *skb; @@ -1786,7 +1784,8 @@ bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr) * network plane */ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, - struct sk_buff_head *xmitq) + struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq) { struct tipc_msg *hdr = buf_msg(skb); struct tipc_gap_ack_blks *ga = NULL; @@ -1926,7 +1925,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) - link_prepare_wakeup(l); + link_prepare_wakeup(l, wakeupq); } exit: kfree_skb(skb); @@ -2072,7 +2071,8 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, } void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, - struct sk_buff_head *xmitq) + struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq) { struct sk_buff *skb, *tmp; struct tipc_link *snd_l = l->bc_sndlink; @@ -2102,7 +2102,7 @@ void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, l->acked = acked; tipc_link_advance_backlog(snd_l, xmitq); if (unlikely(!skb_queue_empty(&snd_l->wakeupq))) - link_prepare_wakeup(snd_l); + link_prepare_wakeup(snd_l, wakeupq); } /* tipc_link_bc_nack_rcv(): receive broadcast nack message @@ -2110,7 +2110,8 @@ void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, * no BCAST_PROTOCOL/STATE messages occur from TIPC v2.5. 
*/ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, - struct sk_buff_head *xmitq) + struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq) { struct tipc_msg *hdr = buf_msg(skb); u32 dnode = msg_destnode(hdr); @@ -2130,7 +2131,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; if (dnode == tipc_own_addr(l->net)) { - tipc_link_bc_ack_rcv(l, acked, xmitq); + tipc_link_bc_ack_rcv(l, acked, xmitq, wakeupq); rc = tipc_link_retrans(l->bc_sndlink, l, from, to, xmitq); l->stats.recv_nacks++; return rc; diff --git a/net/tipc/link.h b/net/tipc/link.h index adcad65e761c..9568bd965945 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -107,6 +107,7 @@ void tipc_link_reset_stats(struct tipc_link *l); int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list, struct sk_buff_head *xmitq); struct sk_buff_head *tipc_link_inputq(struct tipc_link *l); +struct sk_buff_head *tipc_link_wakeupq(struct tipc_link *l); u16 tipc_link_rcv_nxt(struct tipc_link *l); u16 tipc_link_acked(struct tipc_link *l); u32 tipc_link_id(struct tipc_link *l); @@ -130,25 +131,28 @@ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, - struct sk_buff_head *xmitq); + struct sk_buff_head *xmitq, struct sk_buff_head *wakeupq); int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_link_add_bc_peer(struct tipc_link *snd_l, struct tipc_link *uc_l, struct sk_buff_head *xmitq); void tipc_link_remove_bc_peer(struct tipc_link *snd_l, struct tipc_link *rcv_l, - struct sk_buff_head *xmitq); + struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq); int tipc_link_bc_peers(struct tipc_link *l); void tipc_link_set_mtu(struct tipc_link *l, int mtu); int tipc_link_mtu(struct tipc_link *l); void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, - struct sk_buff_head *xmitq); + struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq); void tipc_link_build_bc_sync_msg(struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *xmitq); int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, - struct sk_buff_head *xmitq); + struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq); bool tipc_link_too_silent(struct tipc_link *l); #endif diff --git a/net/tipc/node.c b/net/tipc/node.c index 550581d47d51..d0ec29081b11 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -154,6 +154,7 @@ enum { static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq, struct tipc_media_addr **maddr); static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete); @@ -803,6 +804,7 @@ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, + struct sk_buff_head *wakeupq, struct tipc_media_addr **maddr) { struct tipc_link_entry *le = &n->links[*bearer_id]; @@ -851,6 +853,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!"); tipc_link_fsm_evt(l, 
LINK_RESET_EVT); + skb_queue_splice_tail_init(tipc_link_wakeupq(l), wakeupq); tipc_link_reset(l); tipc_link_build_reset_msg(l, xmitq); *maddr = &n->links[*bearer_id].maddr; @@ -868,6 +871,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!"); + skb_queue_splice_tail_init(tipc_link_wakeupq(l), wakeupq); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); @@ -881,18 +885,20 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) struct tipc_media_addr *maddr = NULL; struct tipc_link *l = le->link; int old_bearer_id = bearer_id; - struct sk_buff_head xmitq; + struct sk_buff_head xmitq, wakeupq; if (!l) return; __skb_queue_head_init(&xmitq); + __skb_queue_head_init(&wakeupq); tipc_node_write_lock(n); if (!tipc_link_is_establishing(l)) { - __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); + __tipc_node_link_down(n, &bearer_id, &xmitq, &wakeupq, &maddr); } else { /* Defuse pending tipc_node_link_up() */ + skb_queue_splice_tail_init(tipc_link_wakeupq(l), &wakeupq); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); } @@ -908,6 +914,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); tipc_sk_rcv(n->net, &le->inputq); + tipc_sk_rcv_wakeup(n->net, &wakeupq); } static bool node_is_up(struct tipc_node *n) @@ -1652,6 +1659,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, u16 iseqno = msg_seqno(msg_get_wrapped(hdr)); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; + struct sk_buff_head wakeupq; int state = n->state; struct tipc_link *l, *tnl, *pl = NULL; struct tipc_media_addr *maddr; @@ -1711,9 +1719,12 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; if (pl && !tipc_link_is_reset(pl)) { - __tipc_node_link_down(n, &pb_id, xmitq, &maddr); + __skb_queue_head_init(&wakeupq); + __tipc_node_link_down(n, &pb_id, xmitq, + &wakeupq, &maddr); trace_tipc_node_link_down(n, true, "node link down <- failover!"); + skb_queue_splice_tail(&wakeupq, tipc_link_wakeupq(l)); tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } @@ -1795,7 +1806,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, */ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { - struct sk_buff_head xmitq; + struct sk_buff_head xmitq, wakeupq; struct tipc_node *n; struct tipc_msg *hdr; int bearer_id = b->identity; @@ -1805,6 +1816,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) u16 bc_ack; __skb_queue_head_init(&xmitq); + __skb_queue_head_init(&wakeupq); /* Ensure message is well-formed before touching the header */ if (unlikely(!tipc_msg_validate(&skb))) @@ -1842,7 +1854,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) { spin_lock_bh(&le->lock); if (le->link) { - rc = tipc_link_rcv(le->link, skb, &xmitq); + rc = tipc_link_rcv(le->link, skb, &xmitq, &wakeupq); skb = NULL; } spin_unlock_bh(&le->lock); @@ -1856,7 +1868,8 @@ void tipc_rcv(struct net *net, struct sk_buff 
*skb, struct tipc_bearer *b) tipc_node_write_lock(n); if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) { if (le->link) { - rc = tipc_link_rcv(le->link, skb, &xmitq); + rc = tipc_link_rcv(le->link, skb, + &xmitq, &wakeupq); skb = NULL; } } @@ -1878,6 +1891,9 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); + if (!skb_queue_empty(&wakeupq)) + tipc_sk_rcv_wakeup(net, &wakeupq); + if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index dd8537f988c4..fa48fb14add7 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2224,6 +2224,33 @@ static int tipc_sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) return 0; } +/** + * tipc_sk_add_backlog - add the skb to the socket backlog queue + * @sk: socket where the skb should be enqueued + * @skb: the skb being enqueued + * + * Return true if the skb was added. Otherwise, return false + */ +static bool tipc_sk_add_backlog(struct sock *sk, struct sk_buff *skb) +{ + unsigned int lim; + atomic_t *dcnt; + + /* Try backlog, compensating for double-counted bytes */ + dcnt = &tipc_sk(sk)->dupl_rcvcnt; + if (!sk->sk_backlog.len) + atomic_set(dcnt, 0); + + lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); + if (likely(!sk_add_backlog(sk, skb, lim))) { + trace_tipc_sk_overlimit1(sk, skb, TIPC_DUMP_ALL, + "bklg & rcvq >90% allocated!"); + return true; + } + + return false; +} + /** * tipc_sk_enqueue - extract all buffers with destination 'dport' from * inputq and try adding them to socket or backlog queue @@ -2238,8 +2265,6 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, { unsigned long time_limit = jiffies + 2; struct sk_buff *skb; - unsigned int lim; - atomic_t *dcnt; u32 onode; while (skb_queue_len(inputq)) { @@ -2256,16 +2281,9 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, continue; } - /* Try backlog, compensating for double-counted bytes */ - dcnt = &tipc_sk(sk)->dupl_rcvcnt; - if (!sk->sk_backlog.len) - atomic_set(dcnt, 0); - lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); - if (likely(!sk_add_backlog(sk, skb, lim))) { - trace_tipc_sk_overlimit1(sk, skb, TIPC_DUMP_ALL, - "bklg & rcvq >90% allocated!"); + /* Add the skb to the socket backlog queue */ + if (tipc_sk_add_backlog(sk, skb)) continue; - } trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!"); /* Overload => reject message back to sender */ @@ -2332,6 +2350,57 @@ void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) } } +/** + * tipc_sk_rcv_wakeup - handle a chain of wakeup messages + * @wakeupq: list of wakeup messages + * Consumes all messages in the list until it is empty + */ +void tipc_sk_rcv_wakeup(struct net *net, struct sk_buff_head *wakeupq) +{ + u32 dport = 0; + struct tipc_sock *tsk; + struct sock *sk; + struct sk_buff *skb, *tmp; + +start: + if (!skb_queue_len(wakeupq)) + return; + + skb_queue_walk_safe(wakeupq, skb, tmp) { + dport = msg_destport(buf_msg(skb)); + tsk = tipc_sk_lookup(net, dport); + + if (!unlikely(tsk)) { + __skb_unlink(skb, wakeupq); + kfree_skb(skb); + continue; + } + + sk = &tsk->sk; + if (likely(spin_trylock_bh(&sk->sk_lock.slock))) { + __skb_unlink(skb, wakeupq); + if (!sock_owned_by_user(sk)) { + tipc_dest_del(&tsk->cong_links, + msg_orignode(buf_msg(skb)), 0); + /* coupled with smp_rmb() in + * tipc_wait_for_cond() + */ + smp_wmb(); + tsk->cong_link_cnt--; + sk->sk_write_space(sk); + kfree_skb(skb); 
+ } else if (unlikely(!tipc_sk_add_backlog(sk, skb))) { + kfree_skb(skb); + pr_warn("Drop wakeup message for port %u\n", + tipc_sk(sk)->portid); + } + spin_unlock_bh(&sk->sk_lock.slock); + } + sock_put(sk); + } + goto start; +} + static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) { DEFINE_WAIT_FUNC(wait, woken_wake_function); diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 235b9679acee..44eb9fb68d7e 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -54,6 +54,7 @@ struct tipc_sock; int tipc_socket_init(void); void tipc_socket_stop(void); void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); +void tipc_sk_rcv_wakeup(struct net *net, struct sk_buff_head *wakeupq); void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq); void tipc_sk_reinit(struct net *net); -- 2.17.1 |
From: Jon M. <jon...@er...> - 2019-07-09 20:15:56
> -----Original Message----- > From: Eric Dumazet <eri...@gm...> > Sent: 9-Jul-19 09:46 > To: Jon Maloy <jon...@er...>; Eric Dumazet > <eri...@gm...>; Chris Packham > <Chr...@al...>; yin...@wi...; > da...@da... > Cc: ne...@vg...; tip...@li...; linux- > ke...@vg... > Subject: Re: [PATCH] tipc: ensure skb->lock is initialised > > > > On 7/9/19 3:25 PM, Jon Maloy wrote: [...] > > TIPC is using the list lock at message reception within the scope of > tipc_sk_rcv()/tipc_skb_peek_port(), so it is fundamental that the lock always > is correctly initialized. > > Where is the lock acquired, why was it only acquired by queue purge and not > normal dequeues ??? It is acquired twice: - First, in tipc_sk_rcv()->tipc_skb_peek_port(), to peek into one or more queue members and read their destination port number. - Second, in tipc_sk_rcv()->tipc_sk_enqueue()->tipc_skb_dequeue() to unlink a list member from the queue. > >> > > [...] > >> > >> tipc_link_xmit() for example never acquires the spinlock, yet uses > >> skb_peek() and __skb_dequeue() > > > > > > You should look at tipc_node_xmit instead. Node local messages are > > sent directly to tipc_sk_rcv(), and never go through tipc_link_xmit() > > tipc_node_xmit() calls tipc_link_xmit() eventually, right ? No. tipc_link_xmit() is called only for messages with a non-local destination. Otherwise, tipc_node_xmit() sends node local messages directly to the destination socket via tipc_sk_rcv(). The argument 'xmitq' becomes 'inputq' in tipc_sk_rcv() and 'list' in tipc_skb_peek_port(), since those functions don't distinguish between local and node external incoming messages. > > Please show me where the head->lock is acquired, and why it needed. The argument 'inputq' to tipc_sk_rcv() may come from two sources: 1) As an aggregated member of each tipc_node::tipc_link_entry. This queue is needed to guarantee sequential delivery of messages from the node/link layer to the socket layer. In this case, there may be buffers for multiple destination sockets in the same queue, and we may have multiple concurrent tipc_sk_rcv() jobs working that queue. So, the lock is needed both for adding (in link.c::tipc_data_input()), peeking and removing buffers. 2) The case you have been looking at, where it is created as 'xmitq' on the stack by a local socket. Here, the lock is not strictly needed, as you have observed. But to reduce code duplication we have chosen to let the code in tipc_sk_rcv() handle both types of queues uniformly, i.e., as if they all contain buffers with potentially multiple destination sockets, worked on by multiple concurrent calls. This requires that the lock is initialized even for this type of queue. We have seen no measurable performance difference between this 'generic' reception algorithm and a tailor-made ditto for local messages only, while the amount of saved code is significant. > > If this is mandatory, then more fixes are needed than just initializing the lock > for lockdep purposes. It is not only for lockdep purposes, -it is essential. But please provide details about where you see that more fixes are needed. BR ///jon |
From: Jon M. <jon...@er...> - 2019-07-09 13:40:07
> -----Original Message----- > From: Eric Dumazet <eri...@gm...> > Sent: 9-Jul-19 03:31 > To: Chris Packham <Chr...@al...>; Eric Dumazet > <eri...@gm...>; Jon Maloy <jon...@er...>; > yin...@wi...; da...@da... > Cc: ne...@vg...; tip...@li...; linux- > ke...@vg... > Subject: Re: [PATCH] tipc: ensure skb->lock is initialised > > > > On 7/8/19 11:13 PM, Chris Packham wrote: > > On 9/07/19 8:43 AM, Chris Packham wrote: > >> On 8/07/19 8:18 PM, Eric Dumazet wrote: > >>> > >>> > >>> On 7/8/19 12:53 AM, Chris Packham wrote: > >>>> tipc_named_node_up() creates a skb list. It passes the list to > >>>> tipc_node_xmit() which has some code paths that can call > >>>> skb_queue_purge() which relies on the list->lock being initialised. > >>>> Ensure tipc_named_node_up() uses skb_queue_head_init() so that the > >>>> lock is explicitly initialised. > >>>> > >>>> Signed-off-by: Chris Packham <chr...@al...> > >>> > >>> I would rather change the faulty skb_queue_purge() to > >>> __skb_queue_purge() > >>> > >> > >> Makes sense. I'll look at that for v2. > >> > > > > Actually maybe not. tipc_rcast_xmit(), tipc_node_xmit_skb(), > > tipc_send_group_msg(), __tipc_sendmsg(), __tipc_sendstream(), and > > tipc_sk_timeout() all use skb_queue_head_init(). So my original change > > brings tipc_named_node_up() into line with them. > > > > I think it should be safe for tipc_node_xmit() to use > > __skb_queue_purge() since all the callers seem to have exclusive > > access to the list of skbs. It still seems that the callers should all > > use > > skb_queue_head_init() for consistency. I agree with that. > > > > No, tipc does not use the list lock (it relies on the socket lock) and therefore > should consistently use __skb_queue_head_init() instead of > skb_queue_head_init() TIPC is using the list lock at message reception within the scope of tipc_sk_rcv()/tipc_skb_peek_port(), so it is fundamental that the lock always is correctly initialized. > [...] > > tipc_link_xmit() for example never acquires the spinlock, yet uses skb_peek() > and __skb_dequeue() You should look at tipc_node_xmit instead. Node local messages are sent directly to tipc_sk_rcv(), and never go through tipc_link_xmit() Regards ///jon |
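For reference, a small sketch of the distinction at the heart of this thread: skb_queue_purge() dequeues under list->lock, while __skb_queue_purge() walks the list locklessly. The wrapper and queue names here are hypothetical:

```c
#include <linux/skbuff.h>

/* A caller-private queue built with __skb_queue_head_init() never had its
 * spinlock initialised, so only the lockless purge may be used on it.
 * A queue built with skb_queue_head_init() may be purged either way, since
 * skb_queue_purge() takes list->lock for each dequeue.
 */
static void example_purge(struct sk_buff_head *private_q,
			  struct sk_buff_head *shared_q)
{
	__skb_queue_purge(private_q);	/* no locking, caller has sole access */
	skb_queue_purge(shared_q);	/* dequeues under shared_q->lock */
}
```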
From: David M. <da...@da...> - 2019-07-07 20:19:09
From: David Miller <da...@da...> Date: Sat, 06 Jul 2019 15:15:44 -0700 (PDT) > From: Xin Long <luc...@gm...> > Date: Sat, 6 Jul 2019 14:48:48 +0800 > >> Hi, David, I saw this patch in "Changes Requested". > > I just put it back to Under Review, thanks. Applied to net-next, thank you. |
From: David M. <da...@da...> - 2019-07-06 22:15:47
From: Xin Long <luc...@gm...> Date: Sat, 6 Jul 2019 14:48:48 +0800 > Hi, David, I saw this patch in "Changes Requested". I just put it back to Under Review, thanks. |
From: Xin L. <luc...@gm...> - 2019-07-06 06:48:47
On Wed, Jul 3, 2019 at 4:33 PM Xin Long <luc...@gm...> wrote: > > On Wed, Jul 3, 2019 at 6:08 AM David Miller <da...@da...> wrote: > > > > From: Xin Long <luc...@gm...> > > Date: Tue, 2 Jul 2019 00:54:55 +0800 > > > > > For these places are protected by rcu_read_lock, we change from > > > rcu_dereference_rtnl to rcu_dereference, as there is no need to > > > check if rtnl lock is held. > > > > > > For these places are protected by rtnl_lock, we change from > > > rcu_dereference_rtnl to rtnl_dereference/rcu_dereference_protected, > > > as no extra memory barriers are needed under rtnl_lock() which also > > > protects tn->bearer_list[] and dev->tipc_ptr/b->media_ptr updating. > > > > > > rcu_dereference_rtnl will be only used in the places where it could > > > be under rcu_read_lock or rtnl_lock. > > > > > > Signed-off-by: Xin Long <luc...@gm...> > > > > In the cases where RTNL is held, even if rcu_read_lock() is also taken, > > we should use rtnl_dereference() because that avoids the READ_ONCE(). > Right, that's what I did in this patch. > > But for the places where it's sometimes called under rtnl_lock() only and > sometimes called under rcu_read_lock() only, like tipc_udp_is_known_peer() > and tipc_udp_rcast_add(), I kept rcu_dereference_rtnl(). makes sense? Hi, David, I saw this patch in "Changes Requested". I've checked all places with this patch, no function calling rcu_dereference() and rcu_dereference_rtnl() will be ONLY called under rtnl_lock() protection. So I can't see a problem with it. Do I need to resend? |
From: Xin L. <luc...@gm...> - 2019-07-03 08:33:55
On Wed, Jul 3, 2019 at 6:08 AM David Miller <da...@da...> wrote: > > From: Xin Long <luc...@gm...> > Date: Tue, 2 Jul 2019 00:54:55 +0800 > > > For these places are protected by rcu_read_lock, we change from > > rcu_dereference_rtnl to rcu_dereference, as there is no need to > > check if rtnl lock is held. > > > > For these places are protected by rtnl_lock, we change from > > rcu_dereference_rtnl to rtnl_dereference/rcu_dereference_protected, > > as no extra memory barriers are needed under rtnl_lock() which also > > protects tn->bearer_list[] and dev->tipc_ptr/b->media_ptr updating. > > > > rcu_dereference_rtnl will be only used in the places where it could > > be under rcu_read_lock or rtnl_lock. > > > > Signed-off-by: Xin Long <luc...@gm...> > > In the cases where RTNL is held, even if rcu_read_lock() is also taken, > we should use rtnl_dereference() because that avoids the READ_ONCE(). Right, that's what I did in this patch. But for the places where it's sometimes called under rtnl_lock() only and sometimes called under rcu_read_lock() only, like tipc_udp_is_known_peer() and tipc_udp_rcast_add(), I kept rcu_dereference_rtnl(). makes sense? |
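A rough sketch of the convention being discussed (the accessors are the mainline RCU/rtnetlink helpers; struct example_state, struct example_cfg and the surrounding functions are hypothetical):

```c
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>

struct example_cfg;

struct example_state {
	struct example_cfg __rcu *cfg;	/* hypothetical RCU-protected pointer */
};

/* Fast path: caller holds rcu_read_lock() only. */
static struct example_cfg *example_get_rcu(struct example_state *s)
{
	return rcu_dereference(s->cfg);
}

/* Control path: caller holds RTNL; rtnl_dereference() avoids the
 * READ_ONCE() that a plain rcu_dereference() implies.
 */
static struct example_cfg *example_get_rtnl(struct example_state *s)
{
	return rtnl_dereference(s->cfg);
}

/* Mixed callers: may run under either rcu_read_lock() or RTNL, so keep
 * rcu_dereference_rtnl(), which is valid in both contexts.
 */
static struct example_cfg *example_get_either(struct example_state *s)
{
	return rcu_dereference_rtnl(s->cfg);
}
```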
From: David M. <da...@da...> - 2019-07-02 22:13:28
From: Xin Long <luc...@gm...> Date: Tue, 2 Jul 2019 00:57:19 +0800 > Both tipc_udp_enable and tipc_udp_disable are called under rtnl_lock, > ub->ubsock could never be NULL in tipc_udp_disable and cleanup_bearer, > so remove the check. > > Also remove the one in tipc_udp_enable by adding "free" label. > > Signed-off-by: Xin Long <luc...@gm...> Looks good, applied. |
From: David M. <da...@da...> - 2019-07-02 22:08:11
From: Xin Long <luc...@gm...> Date: Tue, 2 Jul 2019 00:54:55 +0800 > For these places are protected by rcu_read_lock, we change from > rcu_dereference_rtnl to rcu_dereference, as there is no need to > check if rtnl lock is held. > > For these places are protected by rtnl_lock, we change from > rcu_dereference_rtnl to rtnl_dereference/rcu_dereference_protected, > as no extra memory barriers are needed under rtnl_lock() which also > protects tn->bearer_list[] and dev->tipc_ptr/b->media_ptr updating. > > rcu_dereference_rtnl will be only used in the places where it could > be under rcu_read_lock or rtnl_lock. > > Signed-off-by: Xin Long <luc...@gm...> In the cases where RTNL is held, even if rcu_read_lock() is also taken, we should use rtnl_dereference() because that avoids the READ_ONCE(). |
From: David M. <da...@da...> - 2019-07-02 02:12:35
|
From: Jon Maloy <jon...@er...> Date: Fri, 28 Jun 2019 17:06:20 +0200 > The macro TIPC_BC_RETR_LIM is always used in combination with 'jiffies', > so we can just as well perform the addition in the macro itself. This > way, we get a few shorter code lines and one less line break. > > Signed-off-by: Jon Maloy <jon...@er...> Applied. |
From: Xin L. <luc...@gm...> - 2019-07-01 16:57:25
|
Both tipc_udp_enable and tipc_udp_disable are called under rtnl_lock, ub->ubsock could never be NULL in tipc_udp_disable and cleanup_bearer, so remove the check. Also remove the one in tipc_udp_enable by adding "free" label. Signed-off-by: Xin Long <luc...@gm...> --- net/tipc/udp_media.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 62b85db..287df687 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -759,7 +759,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, err = dst_cache_init(&ub->rcast.dst_cache, GFP_ATOMIC); if (err) - goto err; + goto free; /** * The bcast media address port is used for all peers and the ip @@ -771,13 +771,14 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, else err = tipc_udp_rcast_add(b, &remote); if (err) - goto err; + goto free; return 0; -err: + +free: dst_cache_destroy(&ub->rcast.dst_cache); - if (ub->ubsock) - udp_tunnel_sock_release(ub->ubsock); + udp_tunnel_sock_release(ub->ubsock); +err: kfree(ub); return err; } @@ -795,8 +796,7 @@ static void cleanup_bearer(struct work_struct *work) } dst_cache_destroy(&ub->rcast.dst_cache); - if (ub->ubsock) - udp_tunnel_sock_release(ub->ubsock); + udp_tunnel_sock_release(ub->ubsock); synchronize_net(); kfree(ub); } @@ -811,8 +811,7 @@ static void tipc_udp_disable(struct tipc_bearer *b) pr_err("UDP bearer instance not found\n"); return; } - if (ub->ubsock) - sock_set_flag(ub->ubsock->sk, SOCK_DEAD); + sock_set_flag(ub->ubsock->sk, SOCK_DEAD); RCU_INIT_POINTER(ub->bearer, NULL); /* sock_release need to be done outside of rtnl lock */ -- 2.1.0 |
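The structural point of the patch above is that udp_tunnel_sock_release() is now only reachable on paths where ub->ubsock is known to be valid, which is what lets the NULL checks disappear. A stripped-down sketch of the resulting unwind order follows; the demo_* helpers are invented and the real tipc_udp_enable() takes more arguments and does more setup, so this is only the shape of the error handling, not the function itself.

    #include <linux/net.h>
    #include <linux/slab.h>

    struct demo_bearer {
            struct socket *ubsock;
    };

    static int demo_open_socket(struct demo_bearer *ub);
    static int demo_late_setup(struct demo_bearer *ub);
    static void demo_release_socket(struct socket *sock);

    static int demo_enable(struct demo_bearer **pub)
    {
            struct demo_bearer *ub;
            int err;

            ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
            if (!ub)
                    return -ENOMEM;

            err = demo_open_socket(ub);     /* sets ub->ubsock on success */
            if (err)
                    goto err;               /* no socket yet: nothing to release */

            err = demo_late_setup(ub);      /* e.g. dst_cache / rcast setup */
            if (err)
                    goto free;              /* socket exists: must be released */

            *pub = ub;
            return 0;

    free:
            demo_release_socket(ub->ubsock); /* only reached with a valid socket */
    err:
            kfree(ub);
            return err;
    }

The same reasoning removes the checks in cleanup_bearer() and tipc_udp_disable(): both only run for a bearer whose enable path completed, so ub->ubsock cannot be NULL there.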
From: Xin L. <luc...@gm...> - 2019-07-01 16:55:05
|
For these places are protected by rcu_read_lock, we change from rcu_dereference_rtnl to rcu_dereference, as there is no need to check if rtnl lock is held. For these places are protected by rtnl_lock, we change from rcu_dereference_rtnl to rtnl_dereference/rcu_dereference_protected, as no extra memory barriers are needed under rtnl_lock() which also protects tn->bearer_list[] and dev->tipc_ptr/b->media_ptr updating. rcu_dereference_rtnl will be only used in the places where it could be under rcu_read_lock or rtnl_lock. Signed-off-by: Xin Long <luc...@gm...> --- net/tipc/bearer.c | 14 +++++++------- net/tipc/udp_media.c | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 2bed658..a809c0e 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -62,7 +62,7 @@ static struct tipc_bearer *bearer_get(struct net *net, int bearer_id) { struct tipc_net *tn = tipc_net(net); - return rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + return rcu_dereference(tn->bearer_list[bearer_id]); } static void bearer_disable(struct net *net, struct tipc_bearer *b); @@ -210,7 +210,7 @@ void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest) struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + b = rcu_dereference(tn->bearer_list[bearer_id]); if (b) tipc_disc_add_dest(b->disc); rcu_read_unlock(); @@ -222,7 +222,7 @@ void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest) struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + b = rcu_dereference(tn->bearer_list[bearer_id]); if (b) tipc_disc_remove_dest(b->disc); rcu_read_unlock(); @@ -444,7 +444,7 @@ int tipc_l2_send_msg(struct net *net, struct sk_buff *skb, struct net_device *dev; int delta; - dev = (struct net_device *)rcu_dereference_rtnl(b->media_ptr); + dev = (struct net_device *)rcu_dereference(b->media_ptr); if (!dev) return 0; @@ -481,7 +481,7 @@ int tipc_bearer_mtu(struct net *net, u32 bearer_id) struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(tipc_net(net)->bearer_list[bearer_id]); + b = rcu_dereference(tipc_net(net)->bearer_list[bearer_id]); if (b) mtu = b->mtu; rcu_read_unlock(); @@ -574,8 +574,8 @@ static int tipc_l2_rcv_msg(struct sk_buff *skb, struct net_device *dev, struct tipc_bearer *b; rcu_read_lock(); - b = rcu_dereference_rtnl(dev->tipc_ptr) ?: - rcu_dereference_rtnl(orig_dev->tipc_ptr); + b = rcu_dereference(dev->tipc_ptr) ?: + rcu_dereference(orig_dev->tipc_ptr); if (likely(b && test_bit(0, &b->up) && (skb->pkt_type <= PACKET_MULTICAST))) { skb_mark_not_on_list(skb); diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index b8962df..62b85db 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -231,7 +231,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, } skb_set_inner_protocol(skb, htons(ETH_P_TIPC)); - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rcu_dereference(b->media_ptr); if (!ub) { err = -ENODEV; goto out; @@ -490,7 +490,7 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) } } - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rtnl_dereference(b->media_ptr); if (!ub) { rtnl_unlock(); return -EINVAL; @@ -532,7 +532,7 @@ int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b) struct udp_bearer *ub; struct nlattr *nest; - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rtnl_dereference(b->media_ptr); if (!ub) return -ENODEV; @@ -806,7 +806,7 
@@ static void tipc_udp_disable(struct tipc_bearer *b) { struct udp_bearer *ub; - ub = rcu_dereference_rtnl(b->media_ptr); + ub = rtnl_dereference(b->media_ptr); if (!ub) { pr_err("UDP bearer instance not found\n"); return; -- 2.1.0 |
From: Ying X. <yin...@wi...> - 2019-06-29 11:19:50
|
On 6/28/19 11:06 PM, Jon Maloy wrote: > The macro TIPC_BC_RETR_LIM is always used in combination with 'jiffies', > so we can just as well perform the addition in the macro itself. This > way, we get a few shorter code lines and one less line break. > > Signed-off-by: Jon Maloy <jon...@er...> Acked-by: Ying Xue <yin...@wi...> > --- > net/tipc/link.c | 9 ++++----- > 1 file changed, 4 insertions(+), 5 deletions(-) > > diff --git a/net/tipc/link.c b/net/tipc/link.c > index f8bf63b..66d3a07 100644 > --- a/net/tipc/link.c > +++ b/net/tipc/link.c > @@ -207,7 +207,7 @@ enum { > BC_NACK_SND_SUPPRESS, > }; > > -#define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */ > +#define TIPC_BC_RETR_LIM (jiffies + msecs_to_jiffies(10)) > #define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1)) > > /* > @@ -976,8 +976,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, > __skb_queue_tail(transmq, skb); > /* next retransmit attempt */ > if (link_is_bc_sndlink(l)) > - TIPC_SKB_CB(skb)->nxt_retr = > - jiffies + TIPC_BC_RETR_LIM; > + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; > __skb_queue_tail(xmitq, _skb); > TIPC_SKB_CB(skb)->ackers = l->ackers; > l->rcv_unacked = 0; > @@ -1027,7 +1026,7 @@ static void tipc_link_advance_backlog(struct tipc_link *l, > __skb_queue_tail(&l->transmq, skb); > /* next retransmit attempt */ > if (link_is_bc_sndlink(l)) > - TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; > + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; > > __skb_queue_tail(xmitq, _skb); > TIPC_SKB_CB(skb)->ackers = l->ackers; > @@ -1123,7 +1122,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, > if (link_is_bc_sndlink(l)) { > if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) > continue; > - TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; > + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; > } > _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); > if (!_skb) > |
From: Jon M. <jon...@er...> - 2019-06-28 15:06:28
|
The macro TIPC_BC_RETR_LIM is always used in combination with 'jiffies', so we can just as well perform the addition in the macro itself. This way, we get a few shorter code lines and one less line break. Signed-off-by: Jon Maloy <jon...@er...> --- net/tipc/link.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index f8bf63b..66d3a07 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -207,7 +207,7 @@ enum { BC_NACK_SND_SUPPRESS, }; -#define TIPC_BC_RETR_LIM msecs_to_jiffies(10) /* [ms] */ +#define TIPC_BC_RETR_LIM (jiffies + msecs_to_jiffies(10)) #define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1)) /* @@ -976,8 +976,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, __skb_queue_tail(transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) - TIPC_SKB_CB(skb)->nxt_retr = - jiffies + TIPC_BC_RETR_LIM; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; l->rcv_unacked = 0; @@ -1027,7 +1026,7 @@ static void tipc_link_advance_backlog(struct tipc_link *l, __skb_queue_tail(&l->transmq, skb); /* next retransmit attempt */ if (link_is_bc_sndlink(l)) - TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; @@ -1123,7 +1122,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, if (link_is_bc_sndlink(l)) { if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; - TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; } _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); if (!_skb) -- 2.1.4 |
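One subtlety worth spelling out: after this change TIPC_BC_RETR_LIM is no longer a duration but a deadline computed at the point of expansion, i.e. each assignment captures the jiffies value at that moment plus 10 ms. A minimal sketch of the resulting pattern, with invented DEMO_* names standing in for the TIPC ones:

    #include <linux/jiffies.h>

    #define DEMO_RETR_LIM   (jiffies + msecs_to_jiffies(10))

    struct demo_cb {
            unsigned long nxt_retr;
    };

    static bool demo_may_retransmit(struct demo_cb *cb)
    {
            /* The deadline set at queueing time has not expired yet. */
            if (time_before(jiffies, cb->nxt_retr))
                    return false;

            /* Expands to "jiffies + 10 ms" evaluated right now. */
            cb->nxt_retr = DEMO_RETR_LIM;
            return true;
    }

This only reads well because every use of the macro is an assignment to a deadline field, which is the case in link.c; if a pure interval were ever needed again, msecs_to_jiffies(10) would have to be reintroduced at that call site.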
From: David M. <da...@da...> - 2019-06-28 05:37:23
|
From: Xin Long <luc...@gm...> Date: Thu, 20 Jun 2019 19:03:41 +0800 > As other udp/ip tunnels do, tipc udp media should also have a > lockless dst_cache supported on its tx path. > > Here we add dst_cache into udp_replicast to support dst cache > for both rmcast and rcast, and rmcast uses ub->rcast and each > rcast uses its own node in ub->rcast.list. > > Signed-off-by: Xin Long <luc...@gm...> Applied to net-next. |
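The patch being applied here is not quoted in this thread, but the general shape of a dst_cache-assisted IPv4 tx path looks roughly like the sketch below. The demo_* names and surrounding flow are invented; dst_cache_get_ip4()/dst_cache_set_ip4() are the real helpers, and the cache is assumed to have been set up with dst_cache_init() and torn down with dst_cache_destroy(), as the commit does for ub->rcast and the per-peer entries in ub->rcast.list.

    #include <linux/err.h>
    #include <linux/string.h>
    #include <net/dst_cache.h>
    #include <net/route.h>

    struct demo_peer {
            struct dst_cache dst_cache;
            __be32 daddr;
    };

    static struct rtable *demo_get_route(struct net *net, struct demo_peer *p,
                                         struct flowi4 *fl4)
    {
            struct rtable *rt;

            /* Fast path: reuse the per-peer cached route if still valid;
             * on a hit the cached source address is written to fl4->saddr.
             */
            rt = dst_cache_get_ip4(&p->dst_cache, &fl4->saddr);
            if (rt)
                    return rt;

            /* Slow path: full route lookup, then refill the cache. */
            memset(fl4, 0, sizeof(*fl4));
            fl4->daddr = p->daddr;
            rt = ip_route_output_key(net, fl4);
            if (IS_ERR(rt))
                    return rt;

            dst_cache_set_ip4(&p->dst_cache, &rt->dst, fl4->saddr);
            return rt;
    }

The win is that the common case avoids a full FIB lookup per packet while staying lockless on the tx path, which matches the motivation stated in the commit message.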
From: David M. <da...@da...> - 2019-06-25 20:43:45
|
From: Jon Maloy <jon...@er...> Date: Tue, 25 Jun 2019 19:37:00 +0200 > We rename the inline function msg_get_wrapped() to the more > comprehensible msg_inner_hdr(). > > Signed-off-by: Jon Maloy <jon...@er...> Applied, thanks Jon. |
From: David M. <da...@da...> - 2019-06-25 20:40:40
|
From: Jon Maloy <jon...@er...> Date: Tue, 25 Jun 2019 18:08:13 +0200 > We increase the allocated headroom for the buffer copies to be > retransmitted. This eliminates the need for the lower stack levels > (UDP/IP/L2) to expand the headroom in order to add their own headers. > > Signed-off-by: Jon Maloy <jon...@er...> Applied. |
From: David M. <da...@da...> - 2019-06-25 20:34:18
|
From: Jon Maloy <jon...@er...> Date: Tue, 25 Jun 2019 17:36:43 +0200 > In commit a4dc70d46cf1 ("tipc: extend link reset criteria for stale > packet retransmission") we made link retransmission failure events > dependent on the link tolerance, and not only of the number of failed > retransmission attempts, as we did earlier. This works well. However, > keeping the original, additional criteria of 99 failed retransmissions > is now redundant, and may in some cases lead to failure detection > times in the order of minutes instead of the expected 1.5 sec link > tolerance value. > > We now remove this criteria altogether. > > Acked-by: Ying Xue <yin...@wi...> > Signed-off-by: Jon Maloy <jon...@er...> Applied. |
From: Jon M. <jon...@er...> - 2019-06-25 17:37:06
|
We rename the inline function msg_get_wrapped() to the more comprehensible msg_inner_hdr(). Signed-off-by: Jon Maloy <jon...@er...> --- net/tipc/bcast.c | 4 ++-- net/tipc/link.c | 2 +- net/tipc/msg.h | 4 ++-- net/tipc/node.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 6c997d4..1336f3c 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -323,7 +323,7 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) - hdr = msg_get_wrapped(hdr); + hdr = msg_inner_hdr(hdr); if (msg_type(hdr) != TIPC_MCAST_MSG) return 0; @@ -392,7 +392,7 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, skb = skb_peek(pkts); hdr = buf_msg(skb); if (msg_user(hdr) == MSG_FRAGMENTER) - hdr = msg_get_wrapped(hdr); + hdr = msg_inner_hdr(hdr); msg_set_is_rcast(hdr, method->rcast); /* Switch method ? */ diff --git a/net/tipc/link.c b/net/tipc/link.c index aa79bf8..f8bf63b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -732,7 +732,7 @@ static void link_profile_stats(struct tipc_link *l) if (msg_user(msg) == MSG_FRAGMENTER) { if (msg_type(msg) != FIRST_FRAGMENT) return; - length = msg_size(msg_get_wrapped(msg)); + length = msg_size(msg_inner_hdr(msg)); } l->stats.msg_lengths_total += length; l->stats.msg_length_counts++; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 8de02ad..da509f0 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -308,7 +308,7 @@ static inline unchar *msg_data(struct tipc_msg *m) return ((unchar *)m) + msg_hdr_sz(m); } -static inline struct tipc_msg *msg_get_wrapped(struct tipc_msg *m) +static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } @@ -486,7 +486,7 @@ static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) - m = msg_get_wrapped(m); + m = msg_inner_hdr(m); return msg_word(m, 4); } diff --git a/net/tipc/node.c b/net/tipc/node.c index 550581d..324a1f9 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1649,7 +1649,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); - u16 iseqno = msg_seqno(msg_get_wrapped(hdr)); + u16 iseqno = msg_seqno(msg_inner_hdr(hdr)); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; -- 2.1.4 |
From: Jon M. <jon...@er...> - 2019-06-25 16:08:13
|
We increase the allocated headroom for the buffer copies to be retransmitted. This eliminates the need for the lower stack levels (UDP/IP/L2) to expand the headroom in order to add their own headers. Signed-off-by: Jon Maloy <jon...@er...> --- net/tipc/link.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index af50b53..aa79bf8 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1125,7 +1125,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, continue; TIPC_SKB_CB(skb)->nxt_retr = jiffies + TIPC_BC_RETR_LIM; } - _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); + _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; hdr = buf_msg(_skb); -- 2.1.4 |
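For context: the copy produced by __pskb_copy() is handed down to the bearer, and the UDP/IP/L2 layers each prepend their headers after making sure there is room, typically via skb_cow_head() or an equivalent check before skb_push(). If the copy only reserves MIN_H_SIZE, that room is often not there and the head has to be reallocated for every retransmitted buffer; reserving LL_MAX_HEADER up front makes the later pushes free. A small sketch of the idea, where demo_xmit_copy() is invented and __pskb_copy(), skb_headroom() and LL_MAX_HEADER are the real primitives:

    #include <linux/netdevice.h>
    #include <linux/skbuff.h>

    static struct sk_buff *demo_xmit_copy(struct sk_buff *skb, unsigned int hdr_len)
    {
            struct sk_buff *_skb;

            /* Reserve worst-case link-layer headroom at copy time so that the
             * lower layers' skb_cow_head()/skb_push() calls never need to
             * reallocate the skb head for a retransmitted copy.
             */
            _skb = __pskb_copy(skb, LL_MAX_HEADER + hdr_len, GFP_ATOMIC);
            if (!_skb)
                    return NULL;

            WARN_ON_ONCE(skb_headroom(_skb) < LL_MAX_HEADER);
            return _skb;
    }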