You can subscribe to this list here.
2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
(6) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2004 |
Jan
(9) |
Feb
(11) |
Mar
(22) |
Apr
(73) |
May
(78) |
Jun
(146) |
Jul
(80) |
Aug
(27) |
Sep
(5) |
Oct
(14) |
Nov
(18) |
Dec
(27) |
2005 |
Jan
(20) |
Feb
(30) |
Mar
(19) |
Apr
(28) |
May
(50) |
Jun
(31) |
Jul
(32) |
Aug
(14) |
Sep
(36) |
Oct
(43) |
Nov
(74) |
Dec
(63) |
2006 |
Jan
(34) |
Feb
(32) |
Mar
(21) |
Apr
(76) |
May
(106) |
Jun
(72) |
Jul
(70) |
Aug
(175) |
Sep
(130) |
Oct
(39) |
Nov
(81) |
Dec
(43) |
2007 |
Jan
(81) |
Feb
(36) |
Mar
(20) |
Apr
(43) |
May
(54) |
Jun
(34) |
Jul
(44) |
Aug
(55) |
Sep
(44) |
Oct
(54) |
Nov
(43) |
Dec
(41) |
2008 |
Jan
(42) |
Feb
(84) |
Mar
(73) |
Apr
(30) |
May
(119) |
Jun
(54) |
Jul
(54) |
Aug
(93) |
Sep
(173) |
Oct
(130) |
Nov
(145) |
Dec
(153) |
2009 |
Jan
(59) |
Feb
(12) |
Mar
(28) |
Apr
(18) |
May
(56) |
Jun
(9) |
Jul
(28) |
Aug
(62) |
Sep
(16) |
Oct
(19) |
Nov
(15) |
Dec
(17) |
2010 |
Jan
(14) |
Feb
(36) |
Mar
(37) |
Apr
(30) |
May
(33) |
Jun
(53) |
Jul
(42) |
Aug
(50) |
Sep
(67) |
Oct
(66) |
Nov
(69) |
Dec
(36) |
2011 |
Jan
(52) |
Feb
(45) |
Mar
(49) |
Apr
(21) |
May
(34) |
Jun
(13) |
Jul
(19) |
Aug
(37) |
Sep
(43) |
Oct
(10) |
Nov
(23) |
Dec
(30) |
2012 |
Jan
(42) |
Feb
(36) |
Mar
(46) |
Apr
(25) |
May
(96) |
Jun
(146) |
Jul
(40) |
Aug
(28) |
Sep
(61) |
Oct
(45) |
Nov
(100) |
Dec
(53) |
2013 |
Jan
(79) |
Feb
(24) |
Mar
(134) |
Apr
(156) |
May
(118) |
Jun
(75) |
Jul
(278) |
Aug
(145) |
Sep
(136) |
Oct
(168) |
Nov
(137) |
Dec
(439) |
2014 |
Jan
(284) |
Feb
(158) |
Mar
(231) |
Apr
(275) |
May
(259) |
Jun
(91) |
Jul
(222) |
Aug
(215) |
Sep
(165) |
Oct
(166) |
Nov
(211) |
Dec
(150) |
2015 |
Jan
(164) |
Feb
(324) |
Mar
(299) |
Apr
(214) |
May
(111) |
Jun
(109) |
Jul
(105) |
Aug
(36) |
Sep
(58) |
Oct
(131) |
Nov
(68) |
Dec
(30) |
2016 |
Jan
(46) |
Feb
(87) |
Mar
(135) |
Apr
(174) |
May
(132) |
Jun
(135) |
Jul
(149) |
Aug
(125) |
Sep
(79) |
Oct
(49) |
Nov
(95) |
Dec
(102) |
2017 |
Jan
(104) |
Feb
(75) |
Mar
(72) |
Apr
(53) |
May
(18) |
Jun
(5) |
Jul
(14) |
Aug
(19) |
Sep
(2) |
Oct
(13) |
Nov
(21) |
Dec
(67) |
2018 |
Jan
(56) |
Feb
(50) |
Mar
(148) |
Apr
(41) |
May
(37) |
Jun
(34) |
Jul
(34) |
Aug
(11) |
Sep
(52) |
Oct
(48) |
Nov
(28) |
Dec
(46) |
2019 |
Jan
(29) |
Feb
(63) |
Mar
(95) |
Apr
(54) |
May
(14) |
Jun
(71) |
Jul
(60) |
Aug
(49) |
Sep
(3) |
Oct
(64) |
Nov
(115) |
Dec
(57) |
2020 |
Jan
(15) |
Feb
(9) |
Mar
(38) |
Apr
(27) |
May
(60) |
Jun
(53) |
Jul
(35) |
Aug
(46) |
Sep
(37) |
Oct
(64) |
Nov
(20) |
Dec
(25) |
2021 |
Jan
(20) |
Feb
(31) |
Mar
(27) |
Apr
(23) |
May
(21) |
Jun
(30) |
Jul
(30) |
Aug
(7) |
Sep
(18) |
Oct
|
Nov
(15) |
Dec
(4) |
2022 |
Jan
(3) |
Feb
(1) |
Mar
(10) |
Apr
|
May
(2) |
Jun
(26) |
Jul
(5) |
Aug
|
Sep
(1) |
Oct
(2) |
Nov
(9) |
Dec
(2) |
2023 |
Jan
(4) |
Feb
(4) |
Mar
(5) |
Apr
(10) |
May
(29) |
Jun
(17) |
Jul
|
Aug
|
Sep
(1) |
Oct
(1) |
Nov
(2) |
Dec
|
2024 |
Jan
|
Feb
(6) |
Mar
|
Apr
(1) |
May
(6) |
Jun
|
Jul
(5) |
Aug
|
Sep
(3) |
Oct
|
Nov
|
Dec
|
2025 |
Jan
|
Feb
(3) |
Mar
|
Apr
|
May
|
Jun
|
Jul
(6) |
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: <joh...@de...> - 2019-12-19 05:05:06
|
From: John Rutherford <joh...@de...> To enable iproute2/tipc to generate backwards compatible printouts and validate command parameters for nodes using a <z.c.n> node address, it needs to be able to read the legacy address flag from the kernel. The legacy address flag records the way in which the node identity was originally specified. The legacy address flag is requested by the netlink message TIPC_NL_ADDR_LEGACY_GET. If the flag is set the attribute TIPC_NLA_NET_ADDR_LEGACY is set in the return message. Signed-off-by: John Rutherford <joh...@de...> Acked-by: Jon Maloy <jon...@er...> --- include/uapi/linux/tipc_netlink.h | 2 ++ net/tipc/net.c | 56 +++++++++++++++++++++++++++++++++++++++ net/tipc/net.h | 1 + net/tipc/netlink.c | 6 +++++ 4 files changed, 65 insertions(+) diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 6c2194ab745b..dc0d23a50e69 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -65,6 +65,7 @@ enum { TIPC_NL_UDP_GET_REMOTEIP, TIPC_NL_KEY_SET, TIPC_NL_KEY_FLUSH, + TIPC_NL_ADDR_LEGACY_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -176,6 +177,7 @@ enum { TIPC_NLA_NET_ADDR, /* u32 */ TIPC_NLA_NET_NODEID, /* u64 */ TIPC_NLA_NET_NODEID_W1, /* u64 */ + TIPC_NLA_NET_ADDR_LEGACY, /* flag */ __TIPC_NLA_NET_MAX, TIPC_NLA_NET_MAX = __TIPC_NLA_NET_MAX - 1 diff --git a/net/tipc/net.c b/net/tipc/net.c index 2de3cec9929d..85400e4242de 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -302,3 +302,59 @@ int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) return err; } + +static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg) +{ + struct tipc_net *tn = tipc_net(net); + struct nlattr *attrs; + void *hdr; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + 0, TIPC_NL_ADDR_LEGACY_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); + if (!attrs) + goto msg_full; + + if (tn->legacy_addr_format) + if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY)) + goto attr_msg_full; + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_nl_msg msg; + struct sk_buff *rep; + int err; + + rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!rep) + return -ENOMEM; + + msg.skb = rep; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + err = __tipc_nl_addr_legacy_get(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + + return genlmsg_reply(msg.skb, info); +} diff --git a/net/tipc/net.h b/net/tipc/net.h index b7f2e364eb99..6740d97c706e 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -47,5 +47,6 @@ void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info); #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index e53231bd23b4..7c35094c20b8 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -83,6 +83,7 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { [TIPC_NLA_NET_ADDR] = { .type = NLA_U32 }, [TIPC_NLA_NET_NODEID] = { .type = NLA_U64 }, [TIPC_NLA_NET_NODEID_W1] = { .type = NLA_U64 }, + [TIPC_NLA_NET_ADDR_LEGACY] = { .type = NLA_FLAG } }; const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { @@ -273,6 +274,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .doit = tipc_nl_node_flush_key, }, #endif + { + .cmd = TIPC_NL_ADDR_LEGACY_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_net_addr_legacy_get, + }, }; struct genl_family tipc_genl_family __ro_after_init = { -- 2.1.4 |
From: David M. <da...@da...> - 2019-12-17 22:17:30
|
From: Jon Maloy <jon...@er...> Date: Mon, 16 Dec 2019 19:21:02 +0100 > In the commit referred to below we eliminated sending of the 'gap' > indicator in regular ACK messages, reserving this to explicit NACK > ditto. > > Unfortunately we missed to also eliminate building of the 'gap block' > area in ACK messages. This area is meant to report gaps in the > received packet sequence following the initial gap, so that lost > packets can be retransmitted earlier and received out-of-sequence > packets can be released earlier. However, the interpretation of those > blocks is dependent on a complete and correct sequence of gaps and > acks. Hence, when the initial gap indicator is missing a single gap > block will be interpreted as an acknowledgment of all preceding > packets. This may lead to packets being released prematurely from the > sender's transmit queue, with easily predicatble consequences. > > We now fix this by not building any gap block area if there is no > initial gap to report. > > Fixes: commit 02288248b051 ("tipc: eliminate gap indicator from ACK messages") > Signed-off-by: Jon Maloy <jon...@er...> Applied, thanks Jon. |
From: Ying X. <yin...@wi...> - 2019-12-12 06:28:03
|
On 12/12/19 2:46 AM, Paul E. McKenney wrote: > On Wed, Dec 11, 2019 at 12:42:00PM +0800, Ying Xue wrote: >> On 12/11/19 10:00 AM, Tuong Lien Tong wrote: >>>> >>>> /* Move passive key if any */ >>>> if (key.passive) { >>>> - tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); >>>> + tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, >>> &rx->lock); >>> The 3rd parameter should be the lockdep condition checking instead of the >>> spinlock's pointer i.e. "lockdep_is_held(&rx->lock)"? >>> That's why I'd prefer to use the 'tipc_aead_rcu_swap ()' macro, which is >>> clear & concise at least for the context here. It might be re-used later as >>> well... >>> >> >> Right. The 3rd parameter of rcu_replace_pointer() should be >> "lockdep_is_held(&rx->lock)" instead of "&rx->lock". > > Like this? Yes, I think it's better to set the 3rd parameter of rcu_replace_pointer() with "lockdep_is_held(&rx->lock)". > > Thanx, Paul > > ------------------------------------------------------------------------ > > commit 575bb4ba1b22383656760feb3d122e11656ccdfd > Author: Paul E. McKenney <pa...@ke...> > Date: Mon Dec 9 19:13:45 2019 -0800 > > net/tipc: Replace rcu_swap_protected() with rcu_replace_pointer() > > This commit replaces the use of rcu_swap_protected() with the more > intuitively appealing rcu_replace_pointer() as a step towards removing > rcu_swap_protected(). > > Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-...@ma.../ > Reported-by: Linus Torvalds <tor...@li...> > Reported-by: kbuild test robot <lk...@in...> > Signed-off-by: Paul E. McKenney <pa...@ke...> > [ paulmck: Updated based on Ying Xue and Tuong Lien Tong feedback. ] > Cc: Jon Maloy <jon...@er...> > Cc: Ying Xue <yin...@wi...> > Cc: "David S. Miller" <da...@da...> > Cc: <ne...@vg...> > Cc: <tip...@li...> > > diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c > index 990a872..c8c47fc 100644 > --- a/net/tipc/crypto.c > +++ b/net/tipc/crypto.c > @@ -257,9 +257,6 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, > #define tipc_aead_rcu_ptr(rcu_ptr, lock) \ > rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) > > -#define tipc_aead_rcu_swap(rcu_ptr, ptr, lock) \ > - rcu_swap_protected((rcu_ptr), (ptr), lockdep_is_held(lock)) > - > #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ > do { \ > typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \ > @@ -1189,7 +1186,7 @@ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) > > /* Move passive key if any */ > if (key.passive) { > - tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); > + tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, lockdep_is_held(&rx->lock)); > x = (key.passive - key.pending + new_pending) % KEY_MAX; > new_passive = (x <= 0) ? x + KEY_MAX : x; > } > |
From: Ying X. <yin...@wi...> - 2019-12-11 04:55:36
|
On 12/11/19 10:00 AM, Tuong Lien Tong wrote: >> >> /* Move passive key if any */ >> if (key.passive) { >> - tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); >> + tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, > &rx->lock); > The 3rd parameter should be the lockdep condition checking instead of the > spinlock's pointer i.e. "lockdep_is_held(&rx->lock)"? > That's why I'd prefer to use the 'tipc_aead_rcu_swap ()' macro, which is > clear & concise at least for the context here. It might be re-used later as > well... > Right. The 3rd parameter of rcu_replace_pointer() should be "lockdep_is_held(&rx->lock)" instead of "&rx->lock". |
From: Ying X. <yin...@wi...> - 2019-12-11 04:49:07
|
On 12/11/19 11:17 AM, Paul E. McKenney wrote: >> Acked-by: Ying Xue <yin...@wi...> > As in the following? If so, I will be very happy to apply your Acked-by. Yes. Thanks! |
From: Tuong L. T. <tuo...@de...> - 2019-12-11 02:00:57
|
Hi Ying, Paul, Please see my comments inline. Thanks! BR/Tuong -----Original Message----- From: Ying Xue <yin...@wi...> Sent: Wednesday, December 11, 2019 8:32 AM To: pa...@ke... Cc: ne...@vg...; lin...@vg...; mi...@ke...; tip...@li...; ker...@fb...; tor...@li...; da...@da... Subject: Re: [tipc-discussion] [PATCH net/tipc] Replace rcu_swap_protected() with rcu_replace_pointer() On 12/11/19 6:38 AM, Paul E. McKenney wrote: > commit 4ee8e2c68b076867b7a5af82a38010fffcab611c > Author: Paul E. McKenney <pa...@ke...> > Date: Mon Dec 9 19:13:45 2019 -0800 > > net/tipc: Replace rcu_swap_protected() with rcu_replace_pointer() > > This commit replaces the use of rcu_swap_protected() with the more > intuitively appealing rcu_replace_pointer() as a step towards removing > rcu_swap_protected(). > > Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-GGtM6wcvtyytXZA1+BHqta4g g6...@ma.../ > Reported-by: Linus Torvalds <tor...@li...> > Reported-by: kbuild test robot <lk...@in...> > Signed-off-by: Paul E. McKenney <pa...@ke...> > Cc: Jon Maloy <jon...@er...> > Cc: Ying Xue <yin...@wi...> > Cc: "David S. Miller" <da...@da...> > Cc: <ne...@vg...> > Cc: <tip...@li...> > Acked-by: Ying Xue <yin...@wi...> > diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c > index 990a872..978d2db 100644 > --- a/net/tipc/crypto.c > +++ b/net/tipc/crypto.c > @@ -257,9 +257,6 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, > #define tipc_aead_rcu_ptr(rcu_ptr, lock) \ > rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) > > -#define tipc_aead_rcu_swap(rcu_ptr, ptr, lock) \ > - rcu_swap_protected((rcu_ptr), (ptr), lockdep_is_held(lock)) > - > #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ > do { \ > typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \ > @@ -1189,7 +1186,7 @@ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) > > /* Move passive key if any */ > if (key.passive) { > - tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); > + tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, &rx->lock); The 3rd parameter should be the lockdep condition checking instead of the spinlock's pointer i.e. "lockdep_is_held(&rx->lock)"? That's why I'd prefer to use the 'tipc_aead_rcu_swap ()' macro, which is clear & concise at least for the context here. It might be re-used later as well... > x = (key.passive - key.pending + new_pending) % KEY_MAX; > new_passive = (x <= 0) ? x + KEY_MAX : x; > } > _______________________________________________ tipc-discussion mailing list tip...@li... https://lists.sourceforge.net/lists/listinfo/tipc-discussion |
From: Ying X. <yin...@wi...> - 2019-12-11 01:45:40
|
On 12/11/19 6:38 AM, Paul E. McKenney wrote: > commit 4ee8e2c68b076867b7a5af82a38010fffcab611c > Author: Paul E. McKenney <pa...@ke...> > Date: Mon Dec 9 19:13:45 2019 -0800 > > net/tipc: Replace rcu_swap_protected() with rcu_replace_pointer() > > This commit replaces the use of rcu_swap_protected() with the more > intuitively appealing rcu_replace_pointer() as a step towards removing > rcu_swap_protected(). > > Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-...@ma.../ > Reported-by: Linus Torvalds <tor...@li...> > Reported-by: kbuild test robot <lk...@in...> > Signed-off-by: Paul E. McKenney <pa...@ke...> > Cc: Jon Maloy <jon...@er...> > Cc: Ying Xue <yin...@wi...> > Cc: "David S. Miller" <da...@da...> > Cc: <ne...@vg...> > Cc: <tip...@li...> > Acked-by: Ying Xue <yin...@wi...> > diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c > index 990a872..978d2db 100644 > --- a/net/tipc/crypto.c > +++ b/net/tipc/crypto.c > @@ -257,9 +257,6 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, > #define tipc_aead_rcu_ptr(rcu_ptr, lock) \ > rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) > > -#define tipc_aead_rcu_swap(rcu_ptr, ptr, lock) \ > - rcu_swap_protected((rcu_ptr), (ptr), lockdep_is_held(lock)) > - > #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ > do { \ > typeof(rcu_ptr) __tmp = rcu_dereference_protected((rcu_ptr), \ > @@ -1189,7 +1186,7 @@ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) > > /* Move passive key if any */ > if (key.passive) { > - tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); > + tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, &rx->lock); > x = (key.passive - key.pending + new_pending) % KEY_MAX; > new_passive = (x <= 0) ? x + KEY_MAX : x; > } > |
From: David M. <da...@da...> - 2019-12-11 01:45:30
|
From: Tuong Lien <tuo...@de...> Date: Tue, 10 Dec 2019 15:21:01 +0700 > This series consists of some bug-fixes for TIPC. Series applied, thanks. |
From: David M. <da...@da...> - 2019-12-11 01:31:43
|
From: Jon Maloy <jon...@er...> Date: Tue, 10 Dec 2019 00:52:43 +0100 > We improve thoughput greatly by introducing a variety of the Reno > congestion control algorithm at the link level. Series applied, thanks Jon. |
From: Ying X. <yin...@wi...> - 2019-12-10 14:50:36
|
On 12/10/19 11:31 AM, Paul E. McKenney wrote: > This commit replaces the use of rcu_swap_protected() with the more > intuitively appealing rcu_replace_pointer() as a step towards removing > rcu_swap_protected(). > > Link: https://lore.kernel.org/lkml/CAHk-=wiAsJLw1egFEE=Z7-...@ma.../ > Reported-by: Linus Torvalds <tor...@li...> > Reported-by: kbuild test robot <lk...@in...> > Signed-off-by: Paul E. McKenney <pa...@ke...> > Cc: Jon Maloy <jon...@er...> > Cc: Ying Xue <yin...@wi...> > Cc: "David S. Miller" <da...@da...> > Cc: <ne...@vg...> > Cc: <tip...@li...> > > diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c > index 990a872..64cf831 100644 > --- a/net/tipc/crypto.c > +++ b/net/tipc/crypto.c > @@ -258,7 +258,7 @@ static char *tipc_key_change_dump(struct tipc_key old, struct tipc_key new, > rcu_dereference_protected((rcu_ptr), lockdep_is_held(lock)) > > #define tipc_aead_rcu_swap(rcu_ptr, ptr, lock) \ > - rcu_swap_protected((rcu_ptr), (ptr), lockdep_is_held(lock)) > + rcu_replace_pointer((rcu_ptr), (ptr), lockdep_is_held(lock)) (ptr) = rcu_replace_pointer((rcu_ptr), (ptr), lockdep_is_held(lock)) > > #define tipc_aead_rcu_replace(rcu_ptr, ptr, lock) \ > do { \ > @@ -1189,7 +1189,7 @@ static bool tipc_crypto_key_try_align(struct tipc_crypto *rx, u8 new_pending) > > /* Move passive key if any */ > if (key.passive) { > - tipc_aead_rcu_swap(rx->aead[key.passive], tmp2, &rx->lock); > + tmp2 = rcu_replace_pointer(rx->aead[key.passive], tmp2, &rx->lock); tipc_aead_rcu_swap() is only called here in TIPC module. If we use rcu_replace_pointer() to switch pointers instead of calling tipc_aead_rcu_swap() macro, I think we should completely remove tipc_aead_rcu_swap(). > x = (key.passive - key.pending + new_pending) % KEY_MAX; > new_passive = (x <= 0) ? x + KEY_MAX : x; > } > |
From: Ying X. <yin...@wi...> - 2019-12-10 14:26:09
|
On 12/9/19 6:11 PM, Tuong Lien wrote: > In the function 'tipc_disc_rcv()', the 'msg_peer_net_hash()' is called > to read the header data field but after the message skb has been freed, > that might result in a garbage value... > > This commit fixes it by defining a new local variable to store the data > first, just like the other header fields' handling. > > Fixes: f73b12812a3d ("tipc: improve throughput between nodes in netns") > Signed-off-by: Tuong Lien <tuo...@de...> Acked-by: Ying Xue <yin...@wi...> > --- > net/tipc/discover.c | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/net/tipc/discover.c b/net/tipc/discover.c > index b043e8c6397a..bfe43da127c0 100644 > --- a/net/tipc/discover.c > +++ b/net/tipc/discover.c > @@ -194,6 +194,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, > { > struct tipc_net *tn = tipc_net(net); > struct tipc_msg *hdr = buf_msg(skb); > + u32 pnet_hash = msg_peer_net_hash(hdr); > u16 caps = msg_node_capabilities(hdr); > bool legacy = tn->legacy_addr_format; > u32 sugg = msg_sugg_node_addr(hdr); > @@ -242,9 +243,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, > return; > if (!tipc_in_scope(legacy, b->domain, src)) > return; > - tipc_node_check_dest(net, src, peer_id, b, caps, signature, > - msg_peer_net_hash(hdr), &maddr, &respond, > - &dupl_addr); > + tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash, > + &maddr, &respond, &dupl_addr); > if (dupl_addr) > disc_dupl_alert(b, src, &maddr); > if (!respond) > |
From: Tuong L. <tuo...@de...> - 2019-12-10 08:21:26
|
When a user message is sent, TIPC will check if the socket has faced a congestion at link layer. If that happens, it will make a sleep to wait for the congestion to disappear. This leaves a gap for other users to take over the socket (e.g. multi threads) since the socket is released as well. Also, in case of connectionless (e.g. SOCK_RDM), user is free to send messages to various destinations (e.g. via 'sendto()'), then the socket's preformatted header has to be updated correspondingly prior to the actual payload message building. Unfortunately, the latter action is done before the first action which causes a condition issue that the destination of a certain message can be modified incorrectly in the middle, leading to wrong destination when that message is built. Consequently, when the message is sent to the link layer, it gets stuck there forever because the peer node will simply reject it. After a number of retransmission attempts, the link is eventually taken down and the retransmission failure is reported. This commit fixes the problem by rearranging the order of actions to prevent the race condition from occurring, so the message building is 'atomic' and its header will not be modified by anyone. Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") Acked-by: Jon Maloy <jon...@er...> Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/socket.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 41688da233ab..6552f986774c 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1364,8 +1364,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) struct tipc_msg *hdr = &tsk->phdr; struct tipc_name_seq *seq; struct sk_buff_head pkts; - u32 dport, dnode = 0; - u32 type, inst; + u32 dport = 0, dnode = 0; + u32 type = 0, inst = 0; int mtu, rc; if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) @@ -1418,23 +1418,11 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) type = dest->addr.name.name.type; inst = dest->addr.name.name.instance; dnode = dest->addr.name.domain; - msg_set_type(hdr, TIPC_NAMED_MSG); - msg_set_hdr_sz(hdr, NAMED_H_SIZE); - msg_set_nametype(hdr, type); - msg_set_nameinst(hdr, inst); - msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); dport = tipc_nametbl_translate(net, type, inst, &dnode); - msg_set_destnode(hdr, dnode); - msg_set_destport(hdr, dport); if (unlikely(!dport && !dnode)) return -EHOSTUNREACH; } else if (dest->addrtype == TIPC_ADDR_ID) { dnode = dest->addr.id.node; - msg_set_type(hdr, TIPC_DIRECT_MSG); - msg_set_lookup_scope(hdr, 0); - msg_set_destnode(hdr, dnode); - msg_set_destport(hdr, dest->addr.id.ref); - msg_set_hdr_sz(hdr, BASIC_H_SIZE); } else { return -EINVAL; } @@ -1445,6 +1433,22 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (unlikely(rc)) return rc; + if (dest->addrtype == TIPC_ADDR_NAME) { + msg_set_type(hdr, TIPC_NAMED_MSG); + msg_set_hdr_sz(hdr, NAMED_H_SIZE); + msg_set_nametype(hdr, type); + msg_set_nameinst(hdr, inst); + msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); + msg_set_destnode(hdr, dnode); + msg_set_destport(hdr, dport); + } else { /* TIPC_ADDR_ID */ + msg_set_type(hdr, TIPC_DIRECT_MSG); + msg_set_lookup_scope(hdr, 0); + msg_set_destnode(hdr, dnode); + msg_set_destport(hdr, dest->addr.id.ref); + msg_set_hdr_sz(hdr, BASIC_H_SIZE); + } + __skb_queue_head_init(&pkts); mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); -- 2.13.7 |
From: Tuong L. <tuo...@de...> - 2019-12-10 08:21:26
|
In the function 'tipc_disc_rcv()', the 'msg_peer_net_hash()' is called to read the header data field but after the message skb has been freed, that might result in a garbage value... This commit fixes it by defining a new local variable to store the data first, just like the other header fields' handling. Fixes: f73b12812a3d ("tipc: improve throughput between nodes in netns") Acked-by: Jon Maloy <jon...@er...> Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/discover.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/tipc/discover.c b/net/tipc/discover.c index b043e8c6397a..bfe43da127c0 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -194,6 +194,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, { struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); + u32 pnet_hash = msg_peer_net_hash(hdr); u16 caps = msg_node_capabilities(hdr); bool legacy = tn->legacy_addr_format; u32 sugg = msg_sugg_node_addr(hdr); @@ -242,9 +243,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, return; if (!tipc_in_scope(legacy, b->domain, src)) return; - tipc_node_check_dest(net, src, peer_id, b, caps, signature, - msg_peer_net_hash(hdr), &maddr, &respond, - &dupl_addr); + tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash, + &maddr, &respond, &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) -- 2.13.7 |
From: Tuong L. <tuo...@de...> - 2019-12-10 08:21:26
|
In commit c55c8edafa91 ("tipc: smooth change between replicast and broadcast"), we allow instant switching between replicast and broadcast by sending a dummy 'SYN' packet on the last used link to synchronize packets on the links. The 'SYN' message is an object of link congestion also, so if that happens, a 'SOCK_WAKEUP' will be scheduled to be sent back to the socket... However, in that commit, we simply use the same socket 'cong_link_cnt' counter for both the 'SYN' & normal payload message sending. Therefore, if both the replicast & broadcast links are congested, the counter will be not updated correctly but overwritten by the latter congestion. Later on, when the 'SOCK_WAKEUP' messages are processed, the counter is reduced one by one and eventually overflowed. Consequently, further activities on the socket will only wait for the false congestion signal to disappear but never been met. Because sending the 'SYN' message is vital for the mechanism, it should be done anyway. This commit fixes the issue by marking the message with an error code e.g. 'TIPC_ERR_NO_PORT', so its sending should not face a link congestion, there is no need to touch the socket 'cong_link_cnt' either. In addition, in the event of any error (e.g. -ENOBUFS), we will purge the entire payload message queue and make a return immediately. Fixes: c55c8edafa91 ("tipc: smooth change between replicast and broadcast") Acked-by: Jon Maloy <jon...@er...> Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/bcast.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 55aeba681cf4..656ebc79c64e 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -305,17 +305,17 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, * @skb: socket buffer to copy * @method: send method to be used * @dests: destination nodes for message. - * @cong_link_cnt: returns number of encountered congested destination links * Returns 0 if success, otherwise errno */ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, struct tipc_mc_method *method, - struct tipc_nlist *dests, - u16 *cong_link_cnt) + struct tipc_nlist *dests) { struct tipc_msg *hdr, *_hdr; struct sk_buff_head tmpq; struct sk_buff *_skb; + u16 cong_link_cnt; + int rc = 0; /* Is a cluster supporting with new capabilities ? */ if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL)) @@ -343,18 +343,19 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, _hdr = buf_msg(_skb); msg_set_size(_hdr, MCAST_H_SIZE); msg_set_is_rcast(_hdr, !msg_is_rcast(hdr)); + msg_set_errcode(_hdr, TIPC_ERR_NO_PORT); __skb_queue_head_init(&tmpq); __skb_queue_tail(&tmpq, _skb); if (method->rcast) - tipc_bcast_xmit(net, &tmpq, cong_link_cnt); + rc = tipc_bcast_xmit(net, &tmpq, &cong_link_cnt); else - tipc_rcast_xmit(net, &tmpq, dests, cong_link_cnt); + rc = tipc_rcast_xmit(net, &tmpq, dests, &cong_link_cnt); /* This queue should normally be empty by now */ __skb_queue_purge(&tmpq); - return 0; + return rc; } /* tipc_mcast_xmit - deliver message to indicated destination nodes @@ -396,9 +397,14 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, msg_set_is_rcast(hdr, method->rcast); /* Switch method ? */ - if (rcast != method->rcast) - tipc_mcast_send_sync(net, skb, method, - dests, cong_link_cnt); + if (rcast != method->rcast) { + rc = tipc_mcast_send_sync(net, skb, method, dests); + if (unlikely(rc)) { + pr_err("Unable to send SYN: method %d, rc %d\n", + rcast, rc); + goto exit; + } + } if (method->rcast) rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt); -- 2.13.7 |
From: Tuong L. <tuo...@de...> - 2019-12-10 08:21:25
|
The current rbtree for service ranges in the name table is built based on the 'lower' & 'upper' range values resulting in a flaw in the rbtree searching. Some issues have been observed in case of range overlapping: Case #1: unable to withdraw a name entry: After some name services are bound, all of them are withdrawn by user but one remains in the name table forever. This corrupts the table and that service becomes dummy i.e. no real port. E.g. / {22, 22} / / ---> {10, 50} / \ / \ {10, 30} {20, 60} The node {10, 30} cannot be removed since the rbtree searching stops at the node's ancestor i.e. {10, 50}, so starting from it will never reach the finding node. Case #2: failed to send data in some cases: E.g. Two service ranges: {20, 60}, {10, 50} are bound. The rbtree for this service will be one of the two cases below depending on the order of the bindings: {20, 60} {10, 50} <-- / \ / \ / \ / \ {10, 50} NIL <-- NIL {20, 60} (a) (b) Now, try to send some data to service {30}, there will be two results: (a): Failed, no route to host. (b): Ok. The reason is that the rbtree searching will stop at the pointing node as shown above. Case #3: Same as case #2b above but if the data sending's scope is local and the {10, 50} is published by a peer node, then it will result in 'no route to host' even though the other {20, 60} is for example on the local node which should be able to get the data. The issues are actually due to the way we built the rbtree. This commit fixes it by introducing an additional field to each node - named 'max', which is the largest 'upper' of that node subtree. The 'max' value for each subtrees will be propagated correctly whenever a node is inserted/ removed or the tree is rebalanced by the augmented rbtree callbacks. By this way, we can change the rbtree searching appoarch to solve the issues above. Another benefit from this is that we can now improve the searching for a next range matching e.g. in case of multicast, so get rid of the unneeded looping over all nodes in the tree. Acked-by: Jon Maloy <jon...@er...> Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/name_table.c | 279 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 179 insertions(+), 100 deletions(-) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 92d04dc2a44b..359b2bc888cf 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -36,6 +36,7 @@ #include <net/sock.h> #include <linux/list_sort.h> +#include <linux/rbtree_augmented.h> #include "core.h" #include "netlink.h" #include "name_table.h" @@ -51,6 +52,7 @@ * @lower: service range lower bound * @upper: service range upper bound * @tree_node: member of service range RB tree + * @max: largest 'upper' in this node subtree * @local_publ: list of identical publications made from this node * Used by closest_first lookup and multicast lookup algorithm * @all_publ: all publications identical to this one, whatever node and scope @@ -60,6 +62,7 @@ struct service_range { u32 lower; u32 upper; struct rb_node tree_node; + u32 max; struct list_head local_publ; struct list_head all_publ; }; @@ -84,6 +87,130 @@ struct tipc_service { struct rcu_head rcu; }; +#define service_range_upper(sr) ((sr)->upper) +RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, + struct service_range, tree_node, u32, max, + service_range_upper) + +#define service_range_entry(rbtree_node) \ + (container_of(rbtree_node, struct service_range, tree_node)) + +#define service_range_overlap(sr, start, end) \ + ((sr)->lower <= (end) && (sr)->upper >= (start)) + +/** + * service_range_foreach_match - iterate over tipc service rbtree for each + * range match + * @sr: the service range pointer as a loop cursor + * @sc: the pointer to tipc service which holds the service range rbtree + * @start, end: the range (end >= start) for matching + */ +#define service_range_foreach_match(sr, sc, start, end) \ + for (sr = service_range_match_first((sc)->ranges.rb_node, \ + start, \ + end); \ + sr; \ + sr = service_range_match_next(&(sr)->tree_node, \ + start, \ + end)) + +/** + * service_range_match_first - find first service range matching a range + * @n: the root node of service range rbtree for searching + * @start, end: the range (end >= start) for matching + * + * Return: the leftmost service range node in the rbtree that overlaps the + * specific range if any. Otherwise, returns NULL. + */ +static struct service_range *service_range_match_first(struct rb_node *n, + u32 start, u32 end) +{ + struct service_range *sr; + struct rb_node *l, *r; + + /* Non overlaps in tree at all? */ + if (!n || service_range_entry(n)->max < start) + return NULL; + + while (n) { + l = n->rb_left; + if (l && service_range_entry(l)->max >= start) { + /* A leftmost overlap range node must be one in the left + * subtree. If not, it has lower > end, then nodes on + * the right side cannot satisfy the condition either. + */ + n = l; + continue; + } + + /* No one in the left subtree can match, return if this node is + * an overlap i.e. leftmost. + */ + sr = service_range_entry(n); + if (service_range_overlap(sr, start, end)) + return sr; + + /* Ok, try to lookup on the right side */ + r = n->rb_right; + if (sr->lower <= end && + r && service_range_entry(r)->max >= start) { + n = r; + continue; + } + break; + } + + return NULL; +} + +/** + * service_range_match_next - find next service range matching a range + * @n: a node in service range rbtree from which the searching starts + * @start, end: the range (end >= start) for matching + * + * Return: the next service range node to the given node in the rbtree that + * overlaps the specific range if any. Otherwise, returns NULL. + */ +static struct service_range *service_range_match_next(struct rb_node *n, + u32 start, u32 end) +{ + struct service_range *sr; + struct rb_node *p, *r; + + while (n) { + r = n->rb_right; + if (r && service_range_entry(r)->max >= start) + /* A next overlap range node must be one in the right + * subtree. If not, it has lower > end, then any next + * successor (- an ancestor) of this node cannot + * satisfy the condition either. + */ + return service_range_match_first(r, start, end); + + /* No one in the right subtree can match, go up to find an + * ancestor of this node which is parent of a left-hand child. + */ + while ((p = rb_parent(n)) && n == p->rb_right) + n = p; + if (!p) + break; + + /* Return if this ancestor is an overlap */ + sr = service_range_entry(p); + if (service_range_overlap(sr, start, end)) + return sr; + + /* Ok, try to lookup more from this ancestor */ + if (sr->lower <= end) { + n = p; + continue; + } + break; + } + + return NULL; +} + static int hash(int x) { return x & (TIPC_NAMETBL_SIZE - 1); @@ -139,84 +266,51 @@ static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd) return service; } -/** - * tipc_service_first_range - find first service range in tree matching instance - * - * Very time-critical, so binary search through range rb tree - */ -static struct service_range *tipc_service_first_range(struct tipc_service *sc, - u32 instance) -{ - struct rb_node *n = sc->ranges.rb_node; - struct service_range *sr; - - while (n) { - sr = container_of(n, struct service_range, tree_node); - if (sr->lower > instance) - n = n->rb_left; - else if (sr->upper < instance) - n = n->rb_right; - else - return sr; - } - return NULL; -} - /* tipc_service_find_range - find service range matching publication parameters */ static struct service_range *tipc_service_find_range(struct tipc_service *sc, u32 lower, u32 upper) { - struct rb_node *n = sc->ranges.rb_node; struct service_range *sr; - sr = tipc_service_first_range(sc, lower); - if (!sr) - return NULL; - - /* Look for exact match */ - for (n = &sr->tree_node; n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->upper == upper) - break; + service_range_foreach_match(sr, sc, lower, upper) { + /* Look for exact match */ + if (sr->lower == lower && sr->upper == upper) + return sr; } - if (!n || sr->lower != lower || sr->upper != upper) - return NULL; - return sr; + return NULL; } static struct service_range *tipc_service_create_range(struct tipc_service *sc, u32 lower, u32 upper) { struct rb_node **n, *parent = NULL; - struct service_range *sr, *tmp; + struct service_range *sr; n = &sc->ranges.rb_node; while (*n) { - tmp = container_of(*n, struct service_range, tree_node); parent = *n; - tmp = container_of(parent, struct service_range, tree_node); - if (lower < tmp->lower) - n = &(*n)->rb_left; - else if (lower > tmp->lower) - n = &(*n)->rb_right; - else if (upper < tmp->upper) - n = &(*n)->rb_left; - else if (upper > tmp->upper) - n = &(*n)->rb_right; + sr = service_range_entry(parent); + if (lower == sr->lower && upper == sr->upper) + return sr; + if (sr->max < upper) + sr->max = upper; + if (lower <= sr->lower) + n = &parent->rb_left; else - return tmp; + n = &parent->rb_right; } sr = kzalloc(sizeof(*sr), GFP_ATOMIC); if (!sr) return NULL; sr->lower = lower; sr->upper = upper; + sr->max = upper; INIT_LIST_HEAD(&sr->local_publ); INIT_LIST_HEAD(&sr->all_publ); rb_link_node(&sr->tree_node, parent, n); - rb_insert_color(&sr->tree_node, &sc->ranges); + rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); return sr; } @@ -310,7 +404,6 @@ static void tipc_service_subscribe(struct tipc_service *service, struct list_head publ_list; struct service_range *sr; struct tipc_name_seq ns; - struct rb_node *n; u32 filter; ns.type = tipc_sub_read(sb, seq.type); @@ -325,13 +418,7 @@ static void tipc_service_subscribe(struct tipc_service *service, return; INIT_LIST_HEAD(&publ_list); - for (n = rb_first(&service->ranges); n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->lower > ns.upper) - break; - if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper)) - continue; - + service_range_foreach_match(sr, service, ns.lower, ns.upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) @@ -425,7 +512,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, /* Remove service range item if this was its last publication */ if (list_empty(&sr->all_publ)) { - rb_erase(&sr->tree_node, &sc->ranges); + rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } @@ -473,34 +560,39 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode) rcu_read_lock(); sc = tipc_service_find(net, type); if (unlikely(!sc)) - goto not_found; + goto exit; spin_lock_bh(&sc->lock); - sr = tipc_service_first_range(sc, instance); - if (unlikely(!sr)) - goto no_match; - - /* Select lookup algorithm: local, closest-first or round-robin */ - if (*dnode == self) { - list = &sr->local_publ; - if (list_empty(list)) - goto no_match; - p = list_first_entry(list, struct publication, local_publ); - list_move_tail(&p->local_publ, &sr->local_publ); - } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) { - list = &sr->local_publ; - p = list_first_entry(list, struct publication, local_publ); - list_move_tail(&p->local_publ, &sr->local_publ); - } else { - list = &sr->all_publ; - p = list_first_entry(list, struct publication, all_publ); - list_move_tail(&p->all_publ, &sr->all_publ); + service_range_foreach_match(sr, sc, instance, instance) { + /* Select lookup algo: local, closest-first or round-robin */ + if (*dnode == self) { + list = &sr->local_publ; + if (list_empty(list)) + continue; + p = list_first_entry(list, struct publication, + local_publ); + list_move_tail(&p->local_publ, &sr->local_publ); + } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) { + list = &sr->local_publ; + p = list_first_entry(list, struct publication, + local_publ); + list_move_tail(&p->local_publ, &sr->local_publ); + } else { + list = &sr->all_publ; + p = list_first_entry(list, struct publication, + all_publ); + list_move_tail(&p->all_publ, &sr->all_publ); + } + port = p->port; + node = p->node; + /* Todo: as for legacy, pick the first matching range only, a + * "true" round-robin will be performed as needed. + */ + break; } - port = p->port; - node = p->node; -no_match: spin_unlock_bh(&sc->lock); -not_found: + +exit: rcu_read_unlock(); *dnode = node; return port; @@ -523,7 +615,8 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, spin_lock_bh(&sc->lock); - sr = tipc_service_first_range(sc, instance); + /* Todo: a full search i.e. service_range_foreach_match() instead? */ + sr = service_range_match_first(sc->ranges.rb_node, instance, instance); if (!sr) goto no_match; @@ -552,7 +645,6 @@ void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, struct service_range *sr; struct tipc_service *sc; struct publication *p; - struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, type); @@ -560,13 +652,7 @@ void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, goto exit; spin_lock_bh(&sc->lock); - - for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->upper < lower) - continue; - if (sr->lower > upper) - break; + service_range_foreach_match(sr, sc, lower, upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { if (p->scope == scope || (!exact && p->scope < scope)) tipc_dest_push(dports, 0, p->port); @@ -587,7 +673,6 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, struct service_range *sr; struct tipc_service *sc; struct publication *p; - struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, type); @@ -595,13 +680,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, goto exit; spin_lock_bh(&sc->lock); - - for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->upper < lower) - continue; - if (sr->lower > upper) - break; + service_range_foreach_match(sr, sc, lower, upper) { list_for_each_entry(p, &sr->all_publ, all_publ) { tipc_nlist_add(nodes, p->node); } @@ -799,7 +878,7 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc) tipc_service_remove_publ(sr, p->node, p->key); kfree_rcu(p, rcu); } - rb_erase(&sr->tree_node, &sc->ranges); + rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } hlist_del_init_rcu(&sc->service_list); -- 2.13.7 |
From: Tuong L. <tuo...@de...> - 2019-12-10 08:21:24
|
This series consists of some bug-fixes for TIPC. Tuong Lien (4): tipc: fix name table rbtree issues tipc: fix potential hanging after b/rcast changing tipc: fix retrans failure due to wrong destination tipc: fix use-after-free in tipc_disc_rcv() net/tipc/bcast.c | 24 +++-- net/tipc/discover.c | 6 +- net/tipc/name_table.c | 279 ++++++++++++++++++++++++++++++++------------------ net/tipc/socket.c | 32 +++--- 4 files changed, 215 insertions(+), 126 deletions(-) -- 2.13.7 |
From: <joh...@de...> - 2019-12-10 01:48:23
|
From: John Rutherford <joh...@de...> To enable iproute2/tipc to generate backwards compatible printouts and validate command parameters for nodes using a <z.c.n> node address, it needs to be able to read the legacy address flag from the kernel. The legacy address flag records the way in which the node identity was originally specified. The legacy address flag is requested by the netlink message TIPC_NL_ADDR_LEGACY_GET. If the flag is set the attribute TIPC_NLA_NET_ADDR_LEGACY is set in the return message. Signed-off-by: John Rutherford <joh...@de...> --- include/uapi/linux/tipc_netlink.h | 2 ++ net/tipc/net.c | 56 +++++++++++++++++++++++++++++++++++++++ net/tipc/net.h | 1 + net/tipc/netlink.c | 6 +++++ 4 files changed, 65 insertions(+) diff --git a/include/uapi/linux/tipc_netlink.h b/include/uapi/linux/tipc_netlink.h index 6c2194ab745b..dc0d23a50e69 100644 --- a/include/uapi/linux/tipc_netlink.h +++ b/include/uapi/linux/tipc_netlink.h @@ -65,6 +65,7 @@ enum { TIPC_NL_UDP_GET_REMOTEIP, TIPC_NL_KEY_SET, TIPC_NL_KEY_FLUSH, + TIPC_NL_ADDR_LEGACY_GET, __TIPC_NL_CMD_MAX, TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -176,6 +177,7 @@ enum { TIPC_NLA_NET_ADDR, /* u32 */ TIPC_NLA_NET_NODEID, /* u64 */ TIPC_NLA_NET_NODEID_W1, /* u64 */ + TIPC_NLA_NET_ADDR_LEGACY, /* flag */ __TIPC_NLA_NET_MAX, TIPC_NLA_NET_MAX = __TIPC_NLA_NET_MAX - 1 diff --git a/net/tipc/net.c b/net/tipc/net.c index 2de3cec9929d..85400e4242de 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -302,3 +302,59 @@ int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) return err; } + +static int __tipc_nl_addr_legacy_get(struct net *net, struct tipc_nl_msg *msg) +{ + struct tipc_net *tn = tipc_net(net); + struct nlattr *attrs; + void *hdr; + + hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, + 0, TIPC_NL_ADDR_LEGACY_GET); + if (!hdr) + return -EMSGSIZE; + + attrs = nla_nest_start(msg->skb, TIPC_NLA_NET); + if (!attrs) + goto msg_full; + + if (tn->legacy_addr_format) + if (nla_put_flag(msg->skb, TIPC_NLA_NET_ADDR_LEGACY)) + goto attr_msg_full; + + nla_nest_end(msg->skb, attrs); + genlmsg_end(msg->skb, hdr); + + return 0; + +attr_msg_full: + nla_nest_cancel(msg->skb, attrs); +msg_full: + genlmsg_cancel(msg->skb, hdr); + + return -EMSGSIZE; +} + +int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct tipc_nl_msg msg; + struct sk_buff *rep; + int err; + + rep = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!rep) + return -ENOMEM; + + msg.skb = rep; + msg.portid = info->snd_portid; + msg.seq = info->snd_seq; + + err = __tipc_nl_addr_legacy_get(net, &msg); + if (err) { + nlmsg_free(msg.skb); + return err; + } + + return genlmsg_reply(msg.skb, info); +} diff --git a/net/tipc/net.h b/net/tipc/net.h index b7f2e364eb99..6740d97c706e 100644 --- a/net/tipc/net.h +++ b/net/tipc/net.h @@ -47,5 +47,6 @@ void tipc_net_stop(struct net *net); int tipc_nl_net_dump(struct sk_buff *skb, struct netlink_callback *cb); int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); int __tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info); +int tipc_nl_net_addr_legacy_get(struct sk_buff *skb, struct genl_info *info); #endif diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index e53231bd23b4..7c35094c20b8 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -83,6 +83,7 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { [TIPC_NLA_NET_ADDR] = { .type = NLA_U32 }, [TIPC_NLA_NET_NODEID] = { .type = NLA_U64 }, [TIPC_NLA_NET_NODEID_W1] = { .type = NLA_U64 }, + [TIPC_NLA_NET_ADDR_LEGACY] = { .type = NLA_FLAG } }; const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { @@ -273,6 +274,11 @@ static const struct genl_ops tipc_genl_v2_ops[] = { .doit = tipc_nl_node_flush_key, }, #endif + { + .cmd = TIPC_NL_ADDR_LEGACY_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .doit = tipc_nl_net_addr_legacy_get, + }, }; struct genl_family tipc_genl_family __ro_after_init = { -- 2.1.4 |
From: Jon M. <jon...@er...> - 2019-12-09 15:35:35
|
Acked-by: jon > -----Original Message----- > From: Tuong Lien <tuo...@de...> > Sent: 9-Dec-19 05:12 > To: tip...@li...; Jon Maloy <jon...@er...>; ma...@do...; > yin...@wi... > Subject: [net] tipc: fix use-after-free in tipc_disc_rcv() > > In the function 'tipc_disc_rcv()', the 'msg_peer_net_hash()' is called > to read the header data field but after the message skb has been freed, > that might result in a garbage value... > > This commit fixes it by defining a new local variable to store the data > first, just like the other header fields' handling. > > Fixes: f73b12812a3d ("tipc: improve throughput between nodes in netns") > Signed-off-by: Tuong Lien <tuo...@de...> > --- > net/tipc/discover.c | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/net/tipc/discover.c b/net/tipc/discover.c > index b043e8c6397a..bfe43da127c0 100644 > --- a/net/tipc/discover.c > +++ b/net/tipc/discover.c > @@ -194,6 +194,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, > { > struct tipc_net *tn = tipc_net(net); > struct tipc_msg *hdr = buf_msg(skb); > + u32 pnet_hash = msg_peer_net_hash(hdr); > u16 caps = msg_node_capabilities(hdr); > bool legacy = tn->legacy_addr_format; > u32 sugg = msg_sugg_node_addr(hdr); > @@ -242,9 +243,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, > return; > if (!tipc_in_scope(legacy, b->domain, src)) > return; > - tipc_node_check_dest(net, src, peer_id, b, caps, signature, > - msg_peer_net_hash(hdr), &maddr, &respond, > - &dupl_addr); > + tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash, > + &maddr, &respond, &dupl_addr); > if (dupl_addr) > disc_dupl_alert(b, src, &maddr); > if (!respond) > -- > 2.13.7 |
From: Tuong L. <tuo...@de...> - 2019-12-09 10:11:59
|
In the function 'tipc_disc_rcv()', the 'msg_peer_net_hash()' is called to read the header data field but after the message skb has been freed, that might result in a garbage value... This commit fixes it by defining a new local variable to store the data first, just like the other header fields' handling. Fixes: f73b12812a3d ("tipc: improve throughput between nodes in netns") Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/discover.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/tipc/discover.c b/net/tipc/discover.c index b043e8c6397a..bfe43da127c0 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -194,6 +194,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, { struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); + u32 pnet_hash = msg_peer_net_hash(hdr); u16 caps = msg_node_capabilities(hdr); bool legacy = tn->legacy_addr_format; u32 sugg = msg_sugg_node_addr(hdr); @@ -242,9 +243,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, return; if (!tipc_in_scope(legacy, b->domain, src)) return; - tipc_node_check_dest(net, src, peer_id, b, caps, signature, - msg_peer_net_hash(hdr), &maddr, &respond, - &dupl_addr); + tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash, + &maddr, &respond, &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) -- 2.13.7 |
From: Xue, Y. <Yin...@wi...> - 2019-12-05 15:37:38
|
Thanks Jon for your comments. I fully agree with you. To introduce a retransmission timer for every message might be not a good idea, but introducing a retransmission timer for a block of messages might be a good idea. In most of time tipc network condition is quite good and TIPC message doesn’t need to cross internet, so RTT value should be slightly constant between two nodes. But different TIPC nodes have different HW capability, which means the RTT values might be quite different between different tipc unicast links. Particularly node’s workload is often changed from time to time, which also has impact on RTT value. In short, it looks finally we really need to dynamically measure RTT and dynamically detect message is lost or not with a retransmission timer by comparing with the adjusted RTT although you and I both dislike this approach. Using retransmission timer to detect TIPC message loss probably should be the majority approach. By contrast, SACK or NACK retransmission mechanism might be a supplementary method like fast retransmission in SCTP or TCP world. Lastly, I don’t object to this series. Instead, this series is quite good because it can bring us a large throughput improvement. We can submit request to merge the series to upstream first. Please add my ack-by if you want. After this series, we can do more experiments under quite different network environments to validate whether it’s worth introducing retransmission timer. Thanks, Ying From: Jon Maloy [mailto:jon...@er...] Sent: Thursday, December 5, 2019 4:14 AM To: Jon Maloy; Xue, Ying; xi...@re...; Tuong Tong Lien; Tung Quang Nguyen; Hoang Huu Le; John Rutherford; tip...@li...; tip...@de... Subject: RE: [net-next 1/3] tipc: eliminate gap indicator from ACK messages I tried different varieties of the solutions discussed below. 1) Ignoring 1st, 2d or 3d first NACKs, and rely on the time stamp for repeated retransmissions. 2) Ignoring 1st,2d,4th,5th and all other NACKS which are not a multiple of 3 Some gave the same throughput as with the patches I posted, but none was definitely better, as far as I could see. However, I didn’t do any long series, so there might still be some small percentage improvement I have missed. ///jon From: Jon Maloy <ma...@do...> Sent: 4-Dec-19 13:07 To: Jon Maloy <jon...@er...>; Xue, Ying <Yin...@wi...>; xi...@re...; Tuong Tong Lien <tuo...@de...>; Tung Quang Nguyen <tun...@de...>; Hoang Huu Le <hoa...@de...>; John Rutherford <joh...@de...>; tip...@li...; tip...@de... Subject: Re: [net-next 1/3] tipc: eliminate gap indicator from ACK messages Hi Ying, (cc-ing tipc-discussion, since yout original mail seems to have been dropped somewhere, and I want everybody to be able follow the discussion) Actually we do have a kind of SACK mechanism already, but maybe too simple. See below. On Wednesday, December 4, 2019, 11:45:37 AM GMT-5, Xue, Ying <yin...@wi...<mailto:yin...@wi...>> wrote: I don’t know why I received lots of complains from outlook which indicate my below email was not sent to you. I just try to resend. If you received, please ignore this one. Thanks, Ying -----Original Message----- From: Ying Xue [mailto:yin...@wi...<mailto:yin...@wi...>] Sent: Thursday, December 5, 2019 12:39 AM To: Jon Maloy; Jon Maloy Cc: moh...@er...<mailto:moh...@er...>; par...@gm...<mailto:par...@gm...>; tun...@de...<mailto:tun...@de...>; hoa...@de...<mailto:hoa...@de...>; tuo...@de...<mailto:tuo...@de...>; gor...@de...<mailto:gor...@de...>; tip...@li...<mailto:tip...@li...> Subject: Re: [net-next 1/3] tipc: eliminate gap indicator from ACK messages On 12/2/19 8:32 AM, Jon Maloy wrote: > When we increase the link send window we sometimes observe the > following scenario: > > 1) A packet #N arrives out of order far ahead of a sequence of older > packets which are still under way. The packet is added to the > deferred queue. > 2) The missing packets arrive in sequence, and for each 16th of them > an ACK is sent back to the receiver, as it should be. > 3) When building those ACK messages, it is checked if there is a gap > between the link's 'rcv_nxt' and the first packet in the deferred > queue. This is always the case until packet number #N-1 arrives, and > a 'gap' indicator is added, effectively turning them into NACK > messages. > 4) When those NACKs arrive at the sender, all the requested > retransmissions are done, since it is a first-time request. > > This sometimes leads to a huge amount of redundant retransmissions, > causing a drop in max throughput. This problem gets worse when we > in a later commit introduce variable window congestion control, > since it drops the link back to 'slow start' much more often > than necessary. > I think the essential thing is about how we should precisely detect if package is really lost. When we can identify a message is really lost, that will be good moment to ask sender to retransmit the lost message to receiver. But it's very difficult to identify this thing. When receiver finds a missing message in its receipt queue, the message has two statuses: 1. The missing message is just out of order. 2. The missing message is really lost. For the first case, when receiver detects the missing message and it sends a retransmission request before the out-of-order message arrives at the receiver, It will cause a redundant retransmission. In order to avoid this situation, we should send retransmission request as late as possible. For the second case, when the missing message is really lost, it will cause one bad thing that lots of messages are queued in sending buffer of sender side if Receiver doesn't timely send retransmission request. In order to avoid this situation, we should send retransmission request as early as possible. As a consequence, the two conclusions we drew above conflict each other. It means we cannot just use one method to perfectly solve this problem. So let's take SCTP as an example to understand how it handles this situation: In SCTP, there are two different retransmission mechanisms: one is normal retransmission mechanism, and another is called fast retransmission. In the normal retransmission algorithm, it sets a timer to detect whether message is lost or not. When a message's timer is expired, it supposes the message Is lost and the message is then retransmitted. [jon] This is what we are missing, and as a result first retransmission requests are always done. I have already observed that >90% of those are useless under normal circumstances, since they end up as duplicates. I was able to somewhat improve this figure with my first two patches in the series, but there is still a lot of duplicates. I am sure we can do better here. On a general note, I strongly dislike timers, especially if they are set per packet. Maybe we could just set the control block time stamp to e.g. N ms, and then on each link timeout check the age of the first (and oldest) buffer in the transmit queue. If the age exceeds N ms, we retransmit, and go on to check the next buffer etc. But remember, nobody is using TIPC across the internet, so delay times are short, and it is far likelier that the receiver will detect a gap and send a NACK long before this limit has expired. So, I suspect such a mechanism would be of limited value. Also note that in the last patch we actually introduce something similar in tipc_link_timeout(), although the limit will in practice be 375 ms, and I never once saw it being trigged during my testing. In the fast retransmission algorithm, three SACK messages indicate the same message is missed, the message will be immediately retransmitted before its retransmission timer is expired. [jon] We have a similar mechanism, but we based it on the skb timestamp. If a packet has been retransmitted once, we don't allow it to be retransmitted until 1 ms has elapsed, irrespective of number of NACKs. We could of course base this on the already existing retransmission counter instead. I will try that. Regarding my understanding, the normal retransmission mechanism is used to handle the first case to send retransmission request as late as possible. The fast retransmission algorithm is used to handle the second case to send retransmission request earlier than retransmission timer expiration. Therefore, probably we can use its ideas to handle our TIPC cases: 1. We can introduce a retransmission timer to perform delayed retransmission. 2. We should remain the gap indicator. But when receiver reports a gap to sender, it doesn't send NACK to sender. Instead it should SACK to sender. [jon] All our NACKs are in reality SACKs. It is up to the receiver to interpret them correctly. Whenever sender receives NACK, it will immediately retransmit missed messages, which probably cause lots of duplicated messages. But for SACK, It means Selective ACK which contains gap info. But when a sender receives SACK and gap info, it doesn't immediately retransmit the missed message. Instead, it releases the messages in its sending queue which have been acknowledged by SACK message. Meanwhile, once three SACK messages report the same message Is missed, the sender can immediately retransmit the message, which is exactly what the fast retransmission algorithm is doing. Of course, the two suggested approaches are much more complex than our current TIPC link traffic control algorithm. If we decide to introduce retransmission timer, we have to introduce another algorithm to measure RTT time. I am not sure whether we should introduce retransmission timer, but in my opinion, introducing SACK might be a good idea to TIPC. [jon] Once we have an RTT measuring mechanism in place, we should be able to calculate an optimal expiration time according to the above suggestion. But I am skeptical to its practical value, at least the way TIPC is used now. But I do agree that we need a mechanism for limiting the number of first retransmission attempts, which constitutes the vast majority of retransmissions, and to it vast majority are useless. I think I will first try leveraging the skb retransmission counter here, and see what will happen. ///jon About SCTP SACK, please refer to: https://tools.ietf.org/html/rfc4960#page-34 Thanks, Ying > We now fix this by not sending any 'gap' indicator in regular ACK > messages. We already have a mechanism for sending explicit NACKs > in place, and this is sufficient to keep up the packet flow. > > Signed-off-by: Jon Maloy <jon...@er...<mailto:jon...@er...>> > --- > net/tipc/link.c | 14 +++++++++----- > 1 file changed, 9 insertions(+), 5 deletions(-) > > diff --git a/net/tipc/link.c b/net/tipc/link.c > index 24d4d10..6d86446 100644 > --- a/net/tipc/link.c > +++ b/net/tipc/link.c > @@ -1521,7 +1521,8 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, > struct sk_buff_head *xmitq) > { > u32 def_cnt = ++l->stats.deferred_recv; > - u32 defq_len = skb_queue_len(&l->deferdq); > + struct sk_buff_head *dfq = &l->deferdq; > + u32 defq_len = skb_queue_len(dfq); > int match1, match2; > > if (link_is_bc_rcvlink(l)) { > @@ -1532,8 +1533,12 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, > return 0; > } > > - if (defq_len >= 3 && !((defq_len - 3) % 16)) > - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); > + if (defq_len >= 3 && !((defq_len - 3) % 16)) { > + u16 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; > + > + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, > + rcvgap, 0, 0, xmitq); > + } > return 0; > } > > @@ -1631,7 +1636,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, > if (!tipc_link_is_up(l) && (mtyp == STATE_MSG)) > return; > > - if (!skb_queue_empty(dfq)) > + if ((probe || probe_reply) && !skb_queue_empty(dfq)) > rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; > > skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, > @@ -2079,7 +2084,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, > if (rcvgap || reply) > tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, > rcvgap, 0, 0, xmitq); > - > rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); > > /* If NACK, retransmit will now start at right position */ > |
From: Jon M. <jon...@er...> - 2019-12-04 20:14:37
|
I tried different varieties of the solutions discussed below. 1. Ignoring 1st, 2d or 3d first NACKs, and rely on the time stamp for repeated retransmissions. 2. Ignoring 1st,2d,4th,5th and all other NACKS which are not a multiple of 3 Some gave the same throughput as with the patches I posted, but none was definitely better, as far as I could see. However, I didn’t do any long series, so there might still be some small percentage improvement I have missed. ///jon From: Jon Maloy <ma...@do...> Sent: 4-Dec-19 13:07 To: Jon Maloy <jon...@er...>; Xue, Ying <Yin...@wi...>; xi...@re...; Tuong Tong Lien <tuo...@de...>; Tung Quang Nguyen <tun...@de...>; Hoang Huu Le <hoa...@de...>; John Rutherford <joh...@de...>; tip...@li...; tip...@de... Subject: Re: [net-next 1/3] tipc: eliminate gap indicator from ACK messages Hi Ying, (cc-ing tipc-discussion, since yout original mail seems to have been dropped somewhere, and I want everybody to be able follow the discussion) Actually we do have a kind of SACK mechanism already, but maybe too simple. See below. On Wednesday, December 4, 2019, 11:45:37 AM GMT-5, Xue, Ying <yin...@wi...<mailto:yin...@wi...>> wrote: I don’t know why I received lots of complains from outlook which indicate my below email was not sent to you. I just try to resend. If you received, please ignore this one. Thanks, Ying -----Original Message----- From: Ying Xue [mailto:yin...@wi...<mailto:yin...@wi...>] Sent: Thursday, December 5, 2019 12:39 AM To: Jon Maloy; Jon Maloy Cc: moh...@er...<mailto:moh...@er...>; par...@gm...<mailto:par...@gm...>; tun...@de...<mailto:tun...@de...>; hoa...@de...<mailto:hoa...@de...>; tuo...@de...<mailto:tuo...@de...>; gor...@de...<mailto:gor...@de...>; tip...@li...<mailto:tip...@li...> Subject: Re: [net-next 1/3] tipc: eliminate gap indicator from ACK messages On 12/2/19 8:32 AM, Jon Maloy wrote: > When we increase the link send window we sometimes observe the > following scenario: > > 1) A packet #N arrives out of order far ahead of a sequence of older > packets which are still under way. The packet is added to the > deferred queue. > 2) The missing packets arrive in sequence, and for each 16th of them > an ACK is sent back to the receiver, as it should be. > 3) When building those ACK messages, it is checked if there is a gap > between the link's 'rcv_nxt' and the first packet in the deferred > queue. This is always the case until packet number #N-1 arrives, and > a 'gap' indicator is added, effectively turning them into NACK > messages. > 4) When those NACKs arrive at the sender, all the requested > retransmissions are done, since it is a first-time request. > > This sometimes leads to a huge amount of redundant retransmissions, > causing a drop in max throughput. This problem gets worse when we > in a later commit introduce variable window congestion control, > since it drops the link back to 'slow start' much more often > than necessary. > I think the essential thing is about how we should precisely detect if package is really lost. When we can identify a message is really lost, that will be good moment to ask sender to retransmit the lost message to receiver. But it's very difficult to identify this thing. When receiver finds a missing message in its receipt queue, the message has two statuses: 1. The missing message is just out of order. 2. The missing message is really lost. For the first case, when receiver detects the missing message and it sends a retransmission request before the out-of-order message arrives at the receiver, It will cause a redundant retransmission. In order to avoid this situation, we should send retransmission request as late as possible. For the second case, when the missing message is really lost, it will cause one bad thing that lots of messages are queued in sending buffer of sender side if Receiver doesn't timely send retransmission request. In order to avoid this situation, we should send retransmission request as early as possible. As a consequence, the two conclusions we drew above conflict each other. It means we cannot just use one method to perfectly solve this problem. So let's take SCTP as an example to understand how it handles this situation: In SCTP, there are two different retransmission mechanisms: one is normal retransmission mechanism, and another is called fast retransmission. In the normal retransmission algorithm, it sets a timer to detect whether message is lost or not. When a message's timer is expired, it supposes the message Is lost and the message is then retransmitted. [jon] This is what we are missing, and as a result first retransmission requests are always done. I have already observed that >90% of those are useless under normal circumstances, since they end up as duplicates. I was able to somewhat improve this figure with my first two patches in the series, but there is still a lot of duplicates. I am sure we can do better here. On a general note, I strongly dislike timers, especially if they are set per packet. Maybe we could just set the control block time stamp to e.g. N ms, and then on each link timeout check the age of the first (and oldest) buffer in the transmit queue. If the age exceeds N ms, we retransmit, and go on to check the next buffer etc. But remember, nobody is using TIPC across the internet, so delay times are short, and it is far likelier that the receiver will detect a gap and send a NACK long before this limit has expired. So, I suspect such a mechanism would be of limited value. Also note that in the last patch we actually introduce something similar in tipc_link_timeout(), although the limit will in practice be 375 ms, and I never once saw it being trigged during my testing. In the fast retransmission algorithm, three SACK messages indicate the same message is missed, the message will be immediately retransmitted before its retransmission timer is expired. [jon] We have a similar mechanism, but we based it on the skb timestamp. If a packet has been retransmitted once, we don't allow it to be retransmitted until 1 ms has elapsed, irrespective of number of NACKs. We could of course base this on the already existing retransmission counter instead. I will try that. Regarding my understanding, the normal retransmission mechanism is used to handle the first case to send retransmission request as late as possible. The fast retransmission algorithm is used to handle the second case to send retransmission request earlier than retransmission timer expiration. Therefore, probably we can use its ideas to handle our TIPC cases: 1. We can introduce a retransmission timer to perform delayed retransmission. 2. We should remain the gap indicator. But when receiver reports a gap to sender, it doesn't send NACK to sender. Instead it should SACK to sender. [jon] All our NACKs are in reality SACKs. It is up to the receiver to interpret them correctly. Whenever sender receives NACK, it will immediately retransmit missed messages, which probably cause lots of duplicated messages. But for SACK, It means Selective ACK which contains gap info. But when a sender receives SACK and gap info, it doesn't immediately retransmit the missed message. Instead, it releases the messages in its sending queue which have been acknowledged by SACK message. Meanwhile, once three SACK messages report the same message Is missed, the sender can immediately retransmit the message, which is exactly what the fast retransmission algorithm is doing. Of course, the two suggested approaches are much more complex than our current TIPC link traffic control algorithm. If we decide to introduce retransmission timer, we have to introduce another algorithm to measure RTT time. I am not sure whether we should introduce retransmission timer, but in my opinion, introducing SACK might be a good idea to TIPC. [jon] Once we have an RTT measuring mechanism in place, we should be able to calculate an optimal expiration time according to the above suggestion. But I am skeptical to its practical value, at least the way TIPC is used now. But I do agree that we need a mechanism for limiting the number of first retransmission attempts, which constitutes the vast majority of retransmissions, and to it vast majority are useless. I think I will first try leveraging the skb retransmission counter here, and see what will happen. ///jon About SCTP SACK, please refer to: https://tools.ietf.org/html/rfc4960#page-34 Thanks, Ying > We now fix this by not sending any 'gap' indicator in regular ACK > messages. We already have a mechanism for sending explicit NACKs > in place, and this is sufficient to keep up the packet flow. > > Signed-off-by: Jon Maloy <jon...@er...<mailto:jon...@er...>> > --- > net/tipc/link.c | 14 +++++++++----- > 1 file changed, 9 insertions(+), 5 deletions(-) > > diff --git a/net/tipc/link.c b/net/tipc/link.c > index 24d4d10..6d86446 100644 > --- a/net/tipc/link.c > +++ b/net/tipc/link.c > @@ -1521,7 +1521,8 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, > struct sk_buff_head *xmitq) > { > u32 def_cnt = ++l->stats.deferred_recv; > - u32 defq_len = skb_queue_len(&l->deferdq); > + struct sk_buff_head *dfq = &l->deferdq; > + u32 defq_len = skb_queue_len(dfq); > int match1, match2; > > if (link_is_bc_rcvlink(l)) { > @@ -1532,8 +1533,12 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, > return 0; > } > > - if (defq_len >= 3 && !((defq_len - 3) % 16)) > - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); > + if (defq_len >= 3 && !((defq_len - 3) % 16)) { > + u16 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; > + > + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, > + rcvgap, 0, 0, xmitq); > + } > return 0; > } > > @@ -1631,7 +1636,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, > if (!tipc_link_is_up(l) && (mtyp == STATE_MSG)) > return; > > - if (!skb_queue_empty(dfq)) > + if ((probe || probe_reply) && !skb_queue_empty(dfq)) > rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; > > skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, > @@ -2079,7 +2084,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, > if (rcvgap || reply) > tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, > rcvgap, 0, 0, xmitq); > - > rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); > > /* If NACK, retransmit will now start at right position */ > |
From: Jon M. <ma...@do...> - 2019-12-04 18:06:58
|
Hi Ying, (cc-ing tipc-discussion, since yout original mail seems to have been dropped somewhere, and I want everybody to be able follow the discussion) Actually we do have a kind of SACK mechanism already, but maybe too simple.See below. On Wednesday, December 4, 2019, 11:45:37 AM GMT-5, Xue, Ying <yin...@wi...> wrote: I don’t know why I received lots of complains from outlook which indicate my below email was not sent to you. I just try to resend. If you received, please ignore this one. Thanks, Ying -----Original Message----- From: Ying Xue [mailto:yin...@wi...] Sent: Thursday, December 5, 2019 12:39 AM To: Jon Maloy; Jon Maloy Cc: moh...@er...; par...@gm...; tun...@de...; hoa...@de...; tuo...@de...; gor...@de...; tip...@li... Subject: Re: [net-next 1/3] tipc: eliminate gap indicator from ACK messages On 12/2/19 8:32 AM, Jon Maloy wrote: > When we increase the link send window we sometimes observe the > following scenario: > > 1) A packet #N arrives out of order far ahead of a sequence of older > packets which are still under way. The packet is added to the > deferred queue. > 2) The missing packets arrive in sequence, and for each 16th of them > an ACK is sent back to the receiver, as it should be. > 3) When building those ACK messages, it is checked if there is a gap > between the link's 'rcv_nxt' and the first packet in the deferred > queue. This is always the case until packet number #N-1 arrives, and > a 'gap' indicator is added, effectively turning them into NACK > messages. > 4) When those NACKs arrive at the sender, all the requested > retransmissions are done, since it is a first-time request. > > This sometimes leads to a huge amount of redundant retransmissions, > causing a drop in max throughput. This problem gets worse when we > in a later commit introduce variable window congestion control, > since it drops the link back to 'slow start' much more often > than necessary. > I think the essential thing is about how we should precisely detect if package is really lost. When we can identify a message is really lost, that will be good moment to ask sender to retransmit the lost message to receiver. But it's very difficult to identify this thing. When receiver finds a missing message in its receipt queue, the message has two statuses: 1. The missing message is just out of order. 2. The missing message is really lost. For the first case, when receiver detects the missing message and it sends a retransmission request before the out-of-order message arrives at the receiver, It will cause a redundant retransmission. In order to avoid this situation, we should send retransmission request as late as possible. For the second case, when the missing message is really lost, it will cause one bad thing that lots of messages are queued in sending buffer of sender side if Receiver doesn't timely send retransmission request. In order to avoid this situation, we should send retransmission request as early as possible. As a consequence, the two conclusions we drew above conflict each other. It means we cannot just use one method to perfectly solve this problem. So let's take SCTP as an example to understand how it handles this situation: In SCTP, there are two different retransmission mechanisms: one is normal retransmission mechanism, and another is called fast retransmission. In the normal retransmission algorithm, it sets a timer to detect whether message is lost or not. When a message's timer is expired, it supposes the message Is lost and the message is then retransmitted. [jon] This is what we are missing, and as a result first retransmission requests are always done. I have already observed that >90% of those are useless under normal circumstances, since they end up as duplicates. I was able to somewhat improve this figure with my first two patches in the series, but there is still a lot of duplicates.I am sure we can do better here.On a general note, I strongly dislike timers, especially if they are set per packet. Maybe we could just set the control block time stamp to e.g. N ms, and then on each link timeout check the age of the first (and oldest) buffer in the transmit queue. If the age exceeds N ms, we retransmit, and go on to check the next buffer etc.But remember, nobody is using TIPC across the internet, so delay times are short, and it is far likelier that the receiver will detect a gap and send a NACK long before this limit has expired. So, I suspect such a mechanism would be of limited value. Also note that in the last patch we actually introduce something similar in tipc_link_timeout(), although the limit will in practice be 375 ms, and I never once saw it being trigged during my testing. In the fast retransmission algorithm, three SACK messages indicate the same message is missed, the message will be immediately retransmitted before its retransmission timer is expired. [jon] We have a similar mechanism, but we based it on the skb timestamp. If a packet has been retransmitted once, we don't allow it to be retransmitted until 1 ms has elapsed, irrespective of number of NACKs. We could of course base this on the already existing retransmission counter instead. I will try that. Regarding my understanding, the normal retransmission mechanism is used to handle the first case to send retransmission request as late as possible. The fast retransmission algorithm is used to handle the second case to send retransmission request earlier than retransmission timer expiration. Therefore, probably we can use its ideas to handle our TIPC cases: 1. We can introduce a retransmission timer to perform delayed retransmission. 2. We should remain the gap indicator. But when receiver reports a gap to sender, it doesn't send NACK to sender. Instead it should SACK to sender. [jon] All our NACKs are in reality SACKs. It is up to the receiver to interpret them correctly. Whenever sender receives NACK, it will immediately retransmit missed messages, which probably cause lots of duplicated messages. But for SACK, It means Selective ACK which contains gap info. But when a sender receives SACK and gap info, it doesn't immediately retransmit the missed message. Instead, it releases the messages in its sending queue which have been acknowledged by SACK message. Meanwhile, once three SACK messages report the same message Is missed, the sender can immediately retransmit the message, which is exactly what the fast retransmission algorithm is doing. Of course, the two suggested approaches are much more complex than our current TIPC link traffic control algorithm. If we decide to introduce retransmission timer, we have to introduce another algorithm to measure RTT time. I am not sure whether we should introduce retransmission timer, but in my opinion, introducing SACK might be a good idea to TIPC. [jon] Once we have an RTT measuring mechanism in place, we should be able to calculate an optimal expiration time according to the above suggestion. But I am skeptical to its practical value, at least the way TIPC is used now.But I do agree that we need a mechanism for limiting the number of first retransmission attempts, which constitutes the vast majority of retransmissions, and to it vast majority are useless. I think I will first try leveraging the skb retransmission counter here, and see what will happen. ///jon About SCTP SACK, please refer to: https://tools.ietf.org/html/rfc4960#page-34 Thanks, Ying > We now fix this by not sending any 'gap' indicator in regular ACK > messages. We already have a mechanism for sending explicit NACKs > in place, and this is sufficient to keep up the packet flow. > > Signed-off-by: Jon Maloy <jon...@er...> > --- > net/tipc/link.c | 14 +++++++++----- > 1 file changed, 9 insertions(+), 5 deletions(-) > > diff --git a/net/tipc/link.c b/net/tipc/link.c > index 24d4d10..6d86446 100644 > --- a/net/tipc/link.c > +++ b/net/tipc/link.c > @@ -1521,7 +1521,8 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, > struct sk_buff_head *xmitq) > { > u32 def_cnt = ++l->stats.deferred_recv; > - u32 defq_len = skb_queue_len(&l->deferdq); > + struct sk_buff_head *dfq = &l->deferdq; > + u32 defq_len = skb_queue_len(dfq); > int match1, match2; > > if (link_is_bc_rcvlink(l)) { > @@ -1532,8 +1533,12 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, > return 0; > } > > - if (defq_len >= 3 && !((defq_len - 3) % 16)) > - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); > + if (defq_len >= 3 && !((defq_len - 3) % 16)) { > + u16 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; > + > + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, > + rcvgap, 0, 0, xmitq); > + } > return 0; > } > > @@ -1631,7 +1636,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, > if (!tipc_link_is_up(l) && (mtyp == STATE_MSG)) > return; > > - if (!skb_queue_empty(dfq)) > + if ((probe || probe_reply) && !skb_queue_empty(dfq)) > rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; > > skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, > @@ -2079,7 +2084,6 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, > if (rcvgap || reply) > tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, > rcvgap, 0, 0, xmitq); > - > rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); > > /* If NACK, retransmit will now start at right position */ > |
From: Jon M. <jon...@er...> - 2019-12-04 04:04:12
|
A little ugly, but I have no better idea at the moment. Acked-by: Jon > -----Original Message----- > From: Tuong Lien <tuo...@de...> > Sent: 25-Nov-19 23:07 > To: tip...@li...; Jon Maloy <jon...@er...>; ma...@do...; > yin...@wi... > Subject: [PATCH RFC] tipc: fix potential hanging after b/rcast changing > > In commit c55c8edafa91 ("tipc: smooth change between replicast and > broadcast"), we allow instant switching between replicast and broadcast > by sending a dummy 'SYN' packet on the last used link to synchronize > packets on the links. The 'SYN' message is an object of link congestion > also, so if that happens, a 'SOCK_WAKEUP' will be scheduled to be sent > back to the socket... > However, in that commit, we simply use the same socket 'cong_link_cnt' > counter for both the 'SYN' & normal payload message sending. Therefore, > if both the replicast & broadcast links are congested, the counter will > be not updated correctly but overwritten by the latter congestion. > Later on, when the 'SOCK_WAKEUP' messages are processed, the counter is > reduced one by one and eventually overflowed. Consequently, further > activities on the socket will only wait for the false congestion signal > to disappear but never been met. > > Because sending the 'SYN' message is vital for the mechanism, it should > be done anyway. This commit fixes the issue by marking the message with > an error code e.g. 'TIPC_ERR_NO_PORT', so its sending should not face a > link congestion, there is no need to touch the socket 'cong_link_cnt' > either. In addition, in the event of any error (e.g. -ENOBUFS), we will > purge the entire payload message queue and make a return immediately. > > Fixes: c55c8edafa91 ("tipc: smooth change between replicast and broadcast") > Signed-off-by: Tuong Lien <tuo...@de...> > --- > net/tipc/bcast.c | 24 +++++++++++++++--------- > 1 file changed, 15 insertions(+), 9 deletions(-) > > diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c > index 55aeba681cf4..656ebc79c64e 100644 > --- a/net/tipc/bcast.c > +++ b/net/tipc/bcast.c > @@ -305,17 +305,17 @@ static int tipc_rcast_xmit(struct net *net, struct sk_buff_head *pkts, > * @skb: socket buffer to copy > * @method: send method to be used > * @dests: destination nodes for message. > - * @cong_link_cnt: returns number of encountered congested destination links > * Returns 0 if success, otherwise errno > */ > static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, > struct tipc_mc_method *method, > - struct tipc_nlist *dests, > - u16 *cong_link_cnt) > + struct tipc_nlist *dests) > { > struct tipc_msg *hdr, *_hdr; > struct sk_buff_head tmpq; > struct sk_buff *_skb; > + u16 cong_link_cnt; > + int rc = 0; > > /* Is a cluster supporting with new capabilities ? */ > if (!(tipc_net(net)->capabilities & TIPC_MCAST_RBCTL)) > @@ -343,18 +343,19 @@ static int tipc_mcast_send_sync(struct net *net, struct sk_buff *skb, > _hdr = buf_msg(_skb); > msg_set_size(_hdr, MCAST_H_SIZE); > msg_set_is_rcast(_hdr, !msg_is_rcast(hdr)); > + msg_set_errcode(_hdr, TIPC_ERR_NO_PORT); > > __skb_queue_head_init(&tmpq); > __skb_queue_tail(&tmpq, _skb); > if (method->rcast) > - tipc_bcast_xmit(net, &tmpq, cong_link_cnt); > + rc = tipc_bcast_xmit(net, &tmpq, &cong_link_cnt); > else > - tipc_rcast_xmit(net, &tmpq, dests, cong_link_cnt); > + rc = tipc_rcast_xmit(net, &tmpq, dests, &cong_link_cnt); > > /* This queue should normally be empty by now */ > __skb_queue_purge(&tmpq); > > - return 0; > + return rc; > } > > /* tipc_mcast_xmit - deliver message to indicated destination nodes > @@ -396,9 +397,14 @@ int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, > msg_set_is_rcast(hdr, method->rcast); > > /* Switch method ? */ > - if (rcast != method->rcast) > - tipc_mcast_send_sync(net, skb, method, > - dests, cong_link_cnt); > + if (rcast != method->rcast) { > + rc = tipc_mcast_send_sync(net, skb, method, dests); > + if (unlikely(rc)) { > + pr_err("Unable to send SYN: method %d, rc %d\n", > + rcast, rc); > + goto exit; > + } > + } > > if (method->rcast) > rc = tipc_rcast_xmit(net, pkts, dests, cong_link_cnt); > -- > 2.13.7 |
From: Jon M. <jon...@er...> - 2019-12-02 20:00:38
|
Acked-by: Jon > -----Original Message----- > From: Tuong Lien <tuo...@de...> > Sent: 30-Nov-19 05:19 > To: tip...@li...; Jon Maloy <jon...@er...>; ma...@do...; > yin...@wi... > Subject: [PATCH RFC] tipc: fix retrans failure due to wrong destination > > When a user message is sent, TIPC will check if the socket has faced a > congestion at link layer. If that happens, it will make a sleep to wait > for the congestion to disappear. This leaves a gap for other users to > take over the socket (e.g. multi threads) since the socket is released > as well. Also, in case of connectionless (e.g. SOCK_RDM), user is free > to send messages to various destinations (e.g. via 'sendto()'), then > the socket's preformatted header has to be updated correspondingly > prior to the actual payload message building. > > Unfortunately, the latter action is done before the first action which > causes a condition issue that the destination of a certain message can > be modified incorrectly in the middle, leading to wrong destination > when that message is built. Consequently, when the message is sent to > the link layer, it gets stuck there forever because the peer node will > simply reject it. After a number of retransmission attempts, the link > is eventually taken down and the retransmission failure is reported. > > This commit fixes the problem by rearranging the order of actions to > prevent the race condition from occurring, so the message building is > 'atomic' and its header will not be modified by anyone. > > Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") > Signed-off-by: Tuong Lien <tuo...@de...> > --- > net/tipc/socket.c | 32 ++++++++++++++++++-------------- > 1 file changed, 18 insertions(+), 14 deletions(-) > > diff --git a/net/tipc/socket.c b/net/tipc/socket.c > index a1c8d722ca20..9b0280a562a4 100644 > --- a/net/tipc/socket.c > +++ b/net/tipc/socket.c > @@ -1361,8 +1361,8 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) > struct tipc_msg *hdr = &tsk->phdr; > struct tipc_name_seq *seq; > struct sk_buff_head pkts; > - u32 dport, dnode = 0; > - u32 type, inst; > + u32 dport = 0, dnode = 0; > + u32 type = 0, inst = 0; > int mtu, rc; > > if (unlikely(dlen > TIPC_MAX_USER_MSG_SIZE)) > @@ -1415,23 +1415,11 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) > type = dest->addr.name.name.type; > inst = dest->addr.name.name.instance; > dnode = dest->addr.name.domain; > - msg_set_type(hdr, TIPC_NAMED_MSG); > - msg_set_hdr_sz(hdr, NAMED_H_SIZE); > - msg_set_nametype(hdr, type); > - msg_set_nameinst(hdr, inst); > - msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); > dport = tipc_nametbl_translate(net, type, inst, &dnode); > - msg_set_destnode(hdr, dnode); > - msg_set_destport(hdr, dport); > if (unlikely(!dport && !dnode)) > return -EHOSTUNREACH; > } else if (dest->addrtype == TIPC_ADDR_ID) { > dnode = dest->addr.id.node; > - msg_set_type(hdr, TIPC_DIRECT_MSG); > - msg_set_lookup_scope(hdr, 0); > - msg_set_destnode(hdr, dnode); > - msg_set_destport(hdr, dest->addr.id.ref); > - msg_set_hdr_sz(hdr, BASIC_H_SIZE); > } else { > return -EINVAL; > } > @@ -1442,6 +1430,22 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) > if (unlikely(rc)) > return rc; > > + if (dest->addrtype == TIPC_ADDR_NAME) { > + msg_set_type(hdr, TIPC_NAMED_MSG); > + msg_set_hdr_sz(hdr, NAMED_H_SIZE); > + msg_set_nametype(hdr, type); > + msg_set_nameinst(hdr, inst); > + msg_set_lookup_scope(hdr, tipc_node2scope(dnode)); > + msg_set_destnode(hdr, dnode); > + msg_set_destport(hdr, dport); > + } else { /* TIPC_ADDR_ID */ > + msg_set_type(hdr, TIPC_DIRECT_MSG); > + msg_set_lookup_scope(hdr, 0); > + msg_set_destnode(hdr, dnode); > + msg_set_destport(hdr, dest->addr.id.ref); > + msg_set_hdr_sz(hdr, BASIC_H_SIZE); > + } > + > __skb_queue_head_init(&pkts); > mtu = tipc_node_get_mtu(net, dnode, tsk->portid, false); > rc = tipc_msg_build(hdr, m, 0, dlen, mtu, &pkts); > -- > 2.13.7 |
From: Jon M. <ma...@do...> - 2019-12-02 19:39:00
|
Acked-by: Jon On Monday, November 25, 2019, 03:12:45 AM GMT-5, Tuong Lien <tuo...@de...> wrote: In case of stream sockets, a per-socket timer is set up for either the SYN attempt or connection supervision mechanism. When the socket timer expires, an appropriate action (i.e. sending SYN or PROBE message) would be taken just in the case the socket is not being owned by user (e.g. via the 'lock_sock()'). In the latter case, there is nothing but the timer is re-scheduled for a short period of time (~ 50ms) to try again. The function just makes a 'return' immediately without decreasing the socket 'refcnt' which was held in advance for the timer callback itself! The same happens if at the next time, the socket is still busy... As a result, the socket 'refcnt' is increased without decreasing, so the sock object cannot be freed at all (due to 'refcnt' != 0) even when the connection is closed and user releases all related resources. The memory leak is hard to detect because the probe interval is set to 1 hour since the connection is established, but in the case of a SYN attempt, that can be much more likely. The commit fixes the bug by calling the 'sock_put()' in the case mentioned above, then the socket 'refcnt' will be increased & decreased correctly and the sock object can be released later. Fixes: 0d5fcebf3c37 ("tipc: refactor tipc_sk_timeout() function") Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index a1c8d722ca20..d67c3747d2c3 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2757,7 +2757,7 @@ static void tipc_sk_timeout(struct timer_list *t) if (sock_owned_by_user(sk)) { sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ / 20); bh_unlock_sock(sk); - return; + goto exit; } if (sk->sk_state == TIPC_ESTABLISHED) @@ -2775,6 +2775,8 @@ static void tipc_sk_timeout(struct timer_list *t) tipc_dest_push(&tsk->cong_links, pnode, 0); tsk->cong_link_cnt = 1; } + +exit: sock_put(sk); } -- 2.13.7 |