From: Hoang Le <hoa...@de...> - 2019-11-21 03:01:27
When a cluster is set up with a mix of nodes that do and do not support replicast, the replicast capability is disabled on the broadcast send link in order to stay backwards compatible. However, when those non-supporting nodes later leave and are removed from the cluster, we do not update this capability on the broadcast send link, so features that depend on it stay disabled unexpectedly.

In this commit, we make sure the broadcast send link capabilities are re-calculated as soon as a node is removed from or rejoins the cluster.

Acked-by: Jon Maloy <jon...@er...>
Signed-off-by: Hoang Le <hoa...@de...>
---
 net/tipc/bcast.c | 4 ++--
 net/tipc/bcast.h | 2 +-
 net/tipc/link.c  | 2 +-
 net/tipc/node.c  | 8 +++++++-
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index f41096a759fa..55aeba681cf4 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -87,9 +87,9 @@ int tipc_bcast_get_mtu(struct net *net)
 	return tipc_link_mss(tipc_bc_sndlink(net));
 }
 
-void tipc_bcast_disable_rcast(struct net *net)
+void tipc_bcast_toggle_rcast(struct net *net, bool supp)
 {
-	tipc_bc_base(net)->rcast_support = false;
+	tipc_bc_base(net)->rcast_support = supp;
 }
 
 static void tipc_bcbase_calc_bc_threshold(struct net *net)
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index dadad953e2be..9e847d9617d3 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -85,7 +85,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl);
 void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id);
 void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id);
 int tipc_bcast_get_mtu(struct net *net);
-void tipc_bcast_disable_rcast(struct net *net);
+void tipc_bcast_toggle_rcast(struct net *net, bool supp);
 int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts,
 		    struct tipc_mc_method *method, struct tipc_nlist *dests,
 		    u16 *cong_link_cnt);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index fb72031228c9..24d4d10756d3 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -550,7 +550,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer,
 
 	/* Disable replicast if even a single peer doesn't support it */
 	if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST))
-		tipc_bcast_disable_rcast(net);
+		tipc_bcast_toggle_rcast(net, false);
 
 	return true;
 }
diff --git a/net/tipc/node.c b/net/tipc/node.c
index aaf595613e6e..ab04e00cb95b 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -496,6 +496,9 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id,
 			tn->capabilities &= temp_node->capabilities;
 		}
 
+		tipc_bcast_toggle_rcast(net,
+					(tn->capabilities & TIPC_BCAST_RCAST));
+
 		goto exit;
 	}
 	n = kzalloc(sizeof(*n), GFP_ATOMIC);
@@ -557,6 +560,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id,
 	list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
 		tn->capabilities &= temp_node->capabilities;
 	}
+	tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST));
 	trace_tipc_node_create(n, true, " ");
 exit:
 	spin_unlock_bh(&tn->node_list_lock);
@@ -740,7 +744,8 @@ static bool tipc_node_cleanup(struct tipc_node *peer)
 	list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
 		tn->capabilities &= temp_node->capabilities;
 	}
-
+	tipc_bcast_toggle_rcast(peer->net,
+				(tn->capabilities & TIPC_BCAST_RCAST));
 	spin_unlock_bh(&tn->node_list_lock);
 	return deleted;
 }
@@ -2198,6 +2203,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info)
 	list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
 		tn->capabilities &= temp_node->capabilities;
 	}
+	tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST));
 	err = 0;
err_out:
 	tipc_node_put(peer);
-- 
2.20.1
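A side note for readers browsing the archive: the patch relies on the cluster-wide capability field being the bitwise AND of all current members' capabilities, so that intersection has to be recomputed whenever membership changes. The stand-alone C sketch below illustrates only that idea; RCAST_BIT, node_caps and cluster_caps() are invented names for illustration, not TIPC kernel code.

/* Toy illustration of the capability intersection the patch recomputes.
 * All names here are hypothetical; only the rule "cluster capability =
 * AND of all member capabilities" mirrors what TIPC does.
 */
#include <stdio.h>

#define RCAST_BIT 0x20  /* stands in for TIPC_BCAST_RCAST */

static unsigned int cluster_caps(const unsigned int *node_caps, int n)
{
        unsigned int caps = ~0u;
        int i;

        for (i = 0; i < n; i++)
                caps &= node_caps[i];  /* intersection over current members */
        return caps;
}

int main(void)
{
        /* Three peers; the last one lacks replicast support */
        unsigned int caps[] = { 0xff, 0xff, 0xff & ~RCAST_BIT };

        printf("with legacy node: rcast %s\n",
               (cluster_caps(caps, 3) & RCAST_BIT) ? "on" : "off");

        /* Legacy node removed: recompute over the remaining members */
        printf("after removal   : rcast %s\n",
               (cluster_caps(caps, 2) & RCAST_BIT) ? "on" : "off");
        return 0;
}

It prints "off" while the legacy node is present and "on" once it is gone, which is exactly the transition the patch makes happen automatically.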
From: Tuong L. <tuo...@de...> - 2019-11-21 02:53:48
It is observed that TIPC service binding order will not be kept in the publication event report to user if the service is subscribed after the bindings. For example, services are bound by application in the following order: Server: bound port A to {18888,66,66} scope 2 Server: bound port A to {18888,33,33} scope 2 Now, if a client subscribes to the service range (e.g. {18888, 0-100}), it will get the 'TIPC_PUBLISHED' events in that binding order only when the subscription is started before the bindings. Otherwise, if started after the bindings, the events will arrive in the opposite order: Client: received event for published {18888,33,33} Client: received event for published {18888,66,66} For the latter case, it is clear that the bindings have existed in the name table already, so when reported, the events' order will follow the order of the rbtree binding nodes (- a node with lesser 'lower'/'upper' range value will be first). This is correct as we provide the tracking on a specific service status (available or not), not the relationship between multiple services. However, some users expect to see the same order of arriving events irrespective of when the subscription is issued. This turns out to be easy to fix. We now add functionality to ensure that publication events always are issued in the same temporal order as the corresponding bindings were performed. v2: replace the unnecessary macro - 'publication_after()' with inline function. Acked-by: Jon Maloy <jon...@er...> Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/name_table.c | 53 +++++++++++++++++++++++++++++++++++++++++++-------- net/tipc/name_table.h | 4 ++++ 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 66a65c2cdb23..3a3ff0a7d13b 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -35,6 +35,7 @@ */ #include <net/sock.h> +#include <linux/list_sort.h> #include "core.h" #include "netlink.h" #include "name_table.h" @@ -66,6 +67,7 @@ struct service_range { /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service + * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type @@ -74,6 +76,7 @@ struct service_range { */ struct tipc_service { u32 type; + unsigned int publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; @@ -109,6 +112,7 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, INIT_LIST_HEAD(&publ->binding_node); INIT_LIST_HEAD(&publ->local_publ); INIT_LIST_HEAD(&publ->all_publ); + INIT_LIST_HEAD(&publ->list); return publ; } @@ -244,6 +248,8 @@ static struct publication *tipc_service_insert_publ(struct net *net, p = tipc_publ_create(type, lower, upper, scope, node, port, key); if (!p) goto err; + /* Suppose there shouldn't be a huge gap btw publs i.e. 
>INT_MAX */ + p->id = sc->publ_cnt++; if (in_own_node(net, node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); @@ -277,6 +283,22 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr, return NULL; } +static inline int publication_after(struct publication *pa, + struct publication *pb) +{ + return ((int)(pb->id - pa->id) < 0); +} + +static int tipc_publ_sort(void *priv, struct list_head *a, + struct list_head *b) +{ + struct publication *pa, *pb; + + pa = container_of(a, struct publication, list); + pb = container_of(b, struct publication, list); + return publication_after(pa, pb); +} + /** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service @@ -286,36 +308,51 @@ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct tipc_subscr *sb = &sub->evt.s; + struct publication *p, *first, *tmp; + struct list_head publ_list; struct service_range *sr; struct tipc_name_seq ns; - struct publication *p; struct rb_node *n; - bool first; + u32 filter; ns.type = tipc_sub_read(sb, seq.type); ns.lower = tipc_sub_read(sb, seq.lower); ns.upper = tipc_sub_read(sb, seq.upper); + filter = tipc_sub_read(sb, filter); tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); - if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS) + if (filter & TIPC_SUB_NO_STATUS) return; + INIT_LIST_HEAD(&publ_list); for (n = rb_first(&service->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower > ns.upper) break; if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper)) continue; - first = true; + first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { - tipc_sub_report_overlap(sub, sr->lower, sr->upper, - TIPC_PUBLISHED, p->port, - p->node, p->scope, first); - first = false; + if (filter & TIPC_SUB_PORTS) + list_add_tail(&p->list, &publ_list); + else if (!first || publication_after(first, p)) + /* Pick this range's *first* publication */ + first = p; } + if (first) + list_add_tail(&first->list, &publ_list); + } + + /* Sort the publications before reporting */ + list_sort(NULL, &publ_list, tipc_publ_sort); + list_for_each_entry_safe(p, tmp, &publ_list, list) { + tipc_sub_report_overlap(sub, p->lower, p->upper, + TIPC_PUBLISHED, p->port, p->node, + p->scope, true); + list_del_init(&p->list); } } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index f79066334cc8..3d5da71ce41e 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -58,6 +58,7 @@ struct tipc_group; * @node: network address of publishing socket's node * @port: publishing port * @key: publication key, unique across the cluster + * @id: publication id * @binding_node: all publications from the same node which bound this one * - Remote publications: in node->publ_list * Used by node/name distr to withdraw publications when node is lost @@ -69,6 +70,7 @@ struct tipc_group; * Used by closest_first and multicast receive lookup algorithms * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm + * @list: to form a list of publications in temporal order * @rcu: RCU callback head used for deferred freeing */ struct publication { @@ -79,10 +81,12 @@ struct publication { u32 node; u32 port; u32 key; + unsigned int id; struct list_head binding_node; struct list_head binding_sock; struct list_head local_publ; struct list_head all_publ; + struct 
list_head list; struct rcu_head rcu; }; -- 2.13.7 |
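A note for readers: the core trick in this patch is to stamp every publication with a monotonically increasing id at bind time and, before reporting, to sort the collected events with a wrap-safe signed-difference compare. The user-space sketch below shows only that idea using qsort(); struct publ and every name in it are invented for illustration and are not the kernel data structures.

/* Sort "publications" into binding order using a wrap-safe compare,
 * the same trick as publication_after() in the patch.
 */
#include <stdio.h>
#include <stdlib.h>

struct publ {
        unsigned int id;           /* assigned at bind time, may wrap */
        unsigned int lower, upper;
};

/* true if a was published after b, even if id has wrapped around */
static int publ_after(const struct publ *a, const struct publ *b)
{
        return (int)(b->id - a->id) < 0;
}

static int cmp_publ(const void *pa, const void *pb)
{
        const struct publ *a = pa, *b = pb;

        if (a->id == b->id)
                return 0;
        return publ_after(a, b) ? 1 : -1;  /* oldest binding first */
}

int main(void)
{
        /* rbtree order (by range value) differs from binding order (by id) */
        struct publ p[] = {
                { .id = 2, .lower = 33, .upper = 33 },
                { .id = 1, .lower = 66, .upper = 66 },
        };
        int i;

        qsort(p, 2, sizeof(p[0]), cmp_publ);
        for (i = 0; i < 2; i++)
                printf("event for {%u,%u} (id %u)\n",
                       p[i].lower, p[i].upper, p[i].id);
        return 0;
}

Sorting by id rather than by range value is what restores the temporal order of the example above, i.e. {18888,66,66} is reported before {18888,33,33}.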
From: David M. <da...@da...> - 2019-11-20 20:08:28
From: Tuong Lien <tuo...@de...>
Date: Wed, 20 Nov 2019 09:15:19 +0700

> @@ -277,6 +283,17 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr,
>  	return NULL;
>  }
>
> +#define publication_after(pa, pb) (((int)((pb)->id - (pa)->id) < 0))

We have enough of these things, please use existing interfaces such as time32_after() et al.

Thank you.
From: Jon M. <jon...@er...> - 2019-11-20 17:22:39
Hi Ying, See below. > -----Original Message----- > From: Xue, Ying <Yin...@wi...> > Sent: 19-Nov-19 11:47 > To: Jon Maloy <jon...@er...>; Jon Maloy <ma...@do...> > Cc: Mohan Krishna Ghanta Krishnamurthy <moh...@er...>; > par...@gm...; Tung Quang Nguyen <tun...@de...>; Hoang > Huu Le <hoa...@de...>; Tuong Tong Lien <tuo...@de...>; Gordan > Mihaljevic <gor...@de...>; tip...@li... > Subject: RE: [net-next v3 1/1] tipc: introduce variable window congestion control > > Hi Jon, > > I don't know you ever remembered many years ago I ever implemented a prototype to introduce slow- > start and traffic congestion avoidance algorithms on link layer :) > > In my experiment, there were a few of meaningful findings: > - It was crucial for the throughput performance about how to set initial congestion window and upper > congestion window size. > - It was found that different Ethernet speeds, different CPU capabilities and different message sizes all have > a big impact on throughput performance. I did lots of experiments, as a result, sometimes performance > improvement was very obvious, but sometimes, performance improvement was minimal even when I > measured throughput in a similar network environment. Particularly, if I slightly changed some test > conditions, throughput improvement results were also quite different. > > At that moment, I ever doubted whether we needed to do the following changes: > 1. Should we introduce a timer to measure RTT and identify whether network congestion happens or not? Actually, measuring RTT is one of the stated requirements from the Ericsson product line, and I think we should do it. Our plan was to do this by adding an adequately scaled time stamp to probe/probe-reply messages. This in turn could serve as base to calculate the bandwidth product and window limits. > 2. Should we change message delivery mode from message-oriented to byte-oriented? In my original series, sent out Nov. 4th, I did that in an additional patch. However, the solution was intrusive and ugly, and even gave slightly poorer performance than the current packet base one. So I decided to abandon this approach, at least for now. > > Of course, in my experiment I didn't make so big changes. > > So I want to know: > - How did you select the minimum window size and maximum window size? This was done empirically only. Lower than 50 never made any sense, as throughput always would be lower. Throughput seemed to improve up to 500, but thereafter there was no improvement. > - Did you measure TIPC throughput performance on different Ethernets? Including large message size test > and small message size test. I measured it only on virtio. I know this is a weakness, and we should of course try on other drivers as well. > - Did you meet similar phenomena to me when we slightly changed test condition? I observed differences, but the overall picture was that this modest change always gave some improvement. > > In this proposal, during slow-start stage, the window increase is pretty slow: > > + if (qlen >= cwin && (l->snd_nxt - buf_seqno(skb_peek(txq)) == qlen)) { > + add = l->cong_acks++ % 32 ? 0 : 1; > + cwin = min_t(u16, cwin + add, l->max_win); > + l->window = cwin; > + } > > But in TCP slow-start algorithm, during slow-start stage congestion window increase is much more > aggressive than above. As long as congestion window exceeds slow-start threshold, it enters congestion > avoidance stage in which congestion window increases slowly. 
> > I am curious why the congestion window increase is pretty conservative compared to TCP slow-start > algorithm. What factors did you consider when you selected the algorithm? I did it only because it is simple and gives an obvious improvement. I am fully aware that the growth is very slow, and that introducing a threshold where we go from a faster growth to a slower one probably would increase average window size and throughput. I see this as the next step, that we all could experiment with. However, my approach now is that this very simple change is a low hanging fruit that gives an obvious improvement, so we lose nothing by releasing it as is. Then we can make it subject to further improvements later. BR ///jon > > Thanks, > Ying > > -----Original Message----- > From: Jon Maloy [mailto:jon...@er...] > Sent: Tuesday, November 19, 2019 7:33 AM > To: Jon Maloy; Jon Maloy > Cc: moh...@er...; par...@gm...; > tun...@de...; hoa...@de...; tuo...@de...; > gor...@de...; Xue, Ying; tip...@li... > Subject: [net-next v3 1/1] tipc: introduce variable window congestion control > > We introduce a simple variable window congestion control for links. > The algorithm is inspired by the Reno algorithm, and can best be > descibed as working in permanent "congestion avoidance" mode, within > strict limits. > > - We introduce hard lower and upper window limits per link, still > different and configurable per bearer type. > > - Next, we let a link start at the minimum window, and then slowly > increment it for each 32 received non-duplicate ACK. This goes on > until it either reaches the upper limit, or until it receives a > NACK message. > > - For each non-duplicate NACK received, we let the window decrease by > intervals of 1/2 of the current window, but not below the minimum > window. > > The change does in reality have effect only on unicast ethernet > transport, as we have seen that there is no room whatsoever for > increasing the window max size for the UDP bearer. > For now, we also choose to keep the limits for the broadcast link > unchanged and equal. > > This algorithm seems to give a ~25% throughput improvement for large > messages, while it has no effect on throughput for small messages. > > Suggested-by: Xin Long <luc...@gm...> > Acked-by: Xin Long <luc...@gm...> > Signed-off-by: Jon Maloy <jon...@er...> > > --- > v2: - Moved window increment in tipc_advance_backlogq() to before > the transfer loop, as suggested Tuong. > - Introduced logic for incrementing the window even for the > broadcast send link, also suggested by Tuong. 
> v3: - Rebased to latest net-next > --- > net/tipc/bcast.c | 11 ++++---- > net/tipc/bearer.c | 11 ++++---- > net/tipc/bearer.h | 6 +++-- > net/tipc/eth_media.c | 6 ++++- > net/tipc/ib_media.c | 5 +++- > net/tipc/link.c | 76 ++++++++++++++++++++++++++++++++++------------------ > net/tipc/link.h | 9 ++++--- > net/tipc/node.c | 13 +++++---- > net/tipc/udp_media.c | 3 ++- > 9 files changed, 90 insertions(+), 50 deletions(-) > > diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c > index f41096a..84da317 100644 > --- a/net/tipc/bcast.c > +++ b/net/tipc/bcast.c > @@ -562,18 +562,18 @@ int tipc_bclink_reset_stats(struct net *net) > return 0; > } > > -static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit) > +static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) > { > struct tipc_link *l = tipc_bc_sndlink(net); > > if (!l) > return -ENOPROTOOPT; > - if (limit < BCLINK_WIN_MIN) > - limit = BCLINK_WIN_MIN; > - if (limit > TIPC_MAX_LINK_WIN) > + if (max_win < BCLINK_WIN_MIN) > + max_win = BCLINK_WIN_MIN; > + if (max_win > TIPC_MAX_LINK_WIN) > return -EINVAL; > tipc_bcast_lock(net); > - tipc_link_set_queue_limits(l, limit); > + tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win); > tipc_bcast_unlock(net); > return 0; > } > @@ -683,6 +683,7 @@ int tipc_bcast_init(struct net *net) > if (!tipc_link_bc_create(net, 0, 0, > FB_MTU, > BCLINK_WIN_DEFAULT, > + BCLINK_WIN_DEFAULT, > 0, > &bb->inputq, > NULL, > diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c > index d7ec26b..34ca7b7 100644 > --- a/net/tipc/bearer.c > +++ b/net/tipc/bearer.c > @@ -311,7 +311,8 @@ static int tipc_enable_bearer(struct net *net, const char *name, > > b->identity = bearer_id; > b->tolerance = m->tolerance; > - b->window = m->window; > + b->min_win = m->min_win; > + b->max_win = m->max_win; > b->domain = disc_domain; > b->net_plane = bearer_id + 'A'; > b->priority = prio; > @@ -796,7 +797,7 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, > goto prop_msg_full; > if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance)) > goto prop_msg_full; > - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window)) > + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->max_win)) > goto prop_msg_full; > if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) > if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu)) > @@ -1088,7 +1089,7 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) > if (props[TIPC_NLA_PROP_PRIO]) > b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); > if (props[TIPC_NLA_PROP_WIN]) > - b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); > + b->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); > if (props[TIPC_NLA_PROP_MTU]) { > if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) > return -EINVAL; > @@ -1142,7 +1143,7 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, > goto prop_msg_full; > if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance)) > goto prop_msg_full; > - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) > + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->max_win)) > goto prop_msg_full; > if (media->type_id == TIPC_MEDIA_TYPE_UDP) > if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) > @@ -1275,7 +1276,7 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) > if (props[TIPC_NLA_PROP_PRIO]) > m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); > if (props[TIPC_NLA_PROP_WIN]) > - m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); > + m->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); > 
if (props[TIPC_NLA_PROP_MTU]) { > if (m->type_id != TIPC_MEDIA_TYPE_UDP) > return -EINVAL; > diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h > index d0c79cc..bc00231 100644 > --- a/net/tipc/bearer.h > +++ b/net/tipc/bearer.h > @@ -119,7 +119,8 @@ struct tipc_media { > char *raw); > u32 priority; > u32 tolerance; > - u32 window; > + u32 min_win; > + u32 max_win; > u32 mtu; > u32 type_id; > u32 hwaddr_len; > @@ -158,7 +159,8 @@ struct tipc_bearer { > struct packet_type pt; > struct rcu_head rcu; > u32 priority; > - u32 window; > + u32 min_win; > + u32 max_win; > u32 tolerance; > u32 domain; > u32 identity; > diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c > index f69a2fd..38cdcab 100644 > --- a/net/tipc/eth_media.c > +++ b/net/tipc/eth_media.c > @@ -37,6 +37,9 @@ > #include "core.h" > #include "bearer.h" > > +#define TIPC_MIN_ETH_LINK_WIN 50 > +#define TIPC_MAX_ETH_LINK_WIN 500 > + > /* Convert Ethernet address (media address format) to string */ > static int tipc_eth_addr2str(struct tipc_media_addr *addr, > char *strbuf, int bufsz) > @@ -92,7 +95,8 @@ struct tipc_media eth_media_info = { > .raw2addr = tipc_eth_raw2addr, > .priority = TIPC_DEF_LINK_PRI, > .tolerance = TIPC_DEF_LINK_TOL, > - .window = TIPC_DEF_LINK_WIN, > + .min_win = TIPC_MIN_ETH_LINK_WIN, > + .max_win = TIPC_MAX_ETH_LINK_WIN, > .type_id = TIPC_MEDIA_TYPE_ETH, > .hwaddr_len = ETH_ALEN, > .name = "eth" > diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c > index e8c1671..7aa9ff8 100644 > --- a/net/tipc/ib_media.c > +++ b/net/tipc/ib_media.c > @@ -42,6 +42,8 @@ > #include "core.h" > #include "bearer.h" > > +#define TIPC_MAX_IB_LINK_WIN 500 > + > /* convert InfiniBand address (media address format) media address to string */ > static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, > int str_size) > @@ -94,7 +96,8 @@ struct tipc_media ib_media_info = { > .raw2addr = tipc_ib_raw2addr, > .priority = TIPC_DEF_LINK_PRI, > .tolerance = TIPC_DEF_LINK_TOL, > - .window = TIPC_DEF_LINK_WIN, > + .min_win = TIPC_DEF_LINK_WIN, > + .max_win = TIPC_MAX_IB_LINK_WIN, > .type_id = TIPC_MEDIA_TYPE_IB, > .hwaddr_len = INFINIBAND_ALEN, > .name = "ib" > diff --git a/net/tipc/link.c b/net/tipc/link.c > index fb72031..a88950b 100644 > --- a/net/tipc/link.c > +++ b/net/tipc/link.c > @@ -164,7 +164,6 @@ struct tipc_link { > struct sk_buff *target_bskb; > } backlog[5]; > u16 snd_nxt; > - u16 window; > > /* Reception */ > u16 rcv_nxt; > @@ -175,6 +174,10 @@ struct tipc_link { > > /* Congestion handling */ > struct sk_buff_head wakeupq; > + u16 window; > + u16 min_win; > + u16 max_win; > + u16 cong_acks; > > /* Fragmentation/reassembly */ > struct sk_buff *reasm_buf; > @@ -308,9 +311,14 @@ u32 tipc_link_id(struct tipc_link *l) > return l->peer_bearer_id << 16 | l->bearer_id; > } > > -int tipc_link_window(struct tipc_link *l) > +int tipc_link_min_win(struct tipc_link *l) > +{ > + return l->min_win; > +} > + > +int tipc_link_max_win(struct tipc_link *l) > { > - return l->window; > + return l->max_win; > } > > int tipc_link_prio(struct tipc_link *l) > @@ -436,7 +444,8 @@ u32 tipc_link_state(struct tipc_link *l) > * @net_plane: network plane (A,B,c..) 
this link belongs to > * @mtu: mtu to be advertised by link > * @priority: priority to be used by link > - * @window: send window to be used by link > + * @min_win: minimal send window to be used by link > + * @max_win: maximal send window to be used by link > * @session: session to be used by link > * @ownnode: identity of own node > * @peer: node id of peer node > @@ -451,7 +460,7 @@ u32 tipc_link_state(struct tipc_link *l) > */ > bool tipc_link_create(struct net *net, char *if_name, int bearer_id, > int tolerance, char net_plane, u32 mtu, int priority, > - int window, u32 session, u32 self, > + u32 min_win, u32 max_win, u32 session, u32 self, > u32 peer, u8 *peer_id, u16 peer_caps, > struct tipc_link *bc_sndlink, > struct tipc_link *bc_rcvlink, > @@ -495,7 +504,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, > l->advertised_mtu = mtu; > l->mtu = mtu; > l->priority = priority; > - tipc_link_set_queue_limits(l, window); > + tipc_link_set_queue_limits(l, min_win, max_win); > l->ackers = 1; > l->bc_sndlink = bc_sndlink; > l->bc_rcvlink = bc_rcvlink; > @@ -523,7 +532,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, > * Returns true if link was created, otherwise false > */ > bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, > - int mtu, int window, u16 peer_caps, > + int mtu, u32 min_win, u32 max_win, u16 peer_caps, > struct sk_buff_head *inputq, > struct sk_buff_head *namedq, > struct tipc_link *bc_sndlink, > @@ -531,9 +540,9 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, > { > struct tipc_link *l; > > - if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window, > - 0, ownnode, peer, NULL, peer_caps, bc_sndlink, > - NULL, inputq, namedq, link)) > + if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, min_win, > + max_win, 0, ownnode, peer, NULL, peer_caps, > + bc_sndlink, NULL, inputq, namedq, link)) > return false; > > l = *link; > @@ -959,7 +968,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, > int pkt_cnt = skb_queue_len(list); > int imp = msg_importance(hdr); > unsigned int mss = tipc_link_mss(l); > - unsigned int maxwin = l->window; > + unsigned int cwin = l->window; > unsigned int mtu = l->mtu; > bool new_bundle; > int rc = 0; > @@ -988,7 +997,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, > > /* Prepare each packet for sending, and add to relevant queue: */ > while ((skb = __skb_dequeue(list))) { > - if (likely(skb_queue_len(transmq) < maxwin)) { > + if (likely(skb_queue_len(transmq) < cwin)) { > hdr = buf_msg(skb); > msg_set_seqno(hdr, seqno); > msg_set_ack(hdr, ack); > @@ -1038,6 +1047,8 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, > static void tipc_link_advance_backlog(struct tipc_link *l, > struct sk_buff_head *xmitq) > { > + struct sk_buff_head *txq = &l->transmq; > + u16 qlen, add, cwin = l->window; > struct sk_buff *skb, *_skb; > struct tipc_msg *hdr; > u16 seqno = l->snd_nxt; > @@ -1045,7 +1056,13 @@ static void tipc_link_advance_backlog(struct tipc_link *l, > u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; > u32 imp; > > - while (skb_queue_len(&l->transmq) < l->window) { > + qlen = skb_queue_len(txq); > + if (qlen >= cwin && (l->snd_nxt - buf_seqno(skb_peek(txq)) == qlen)) { > + add = l->cong_acks++ % 32 ? 
0 : 1; > + cwin = min_t(u16, cwin + add, l->max_win); > + l->window = cwin; > + } > + while (skb_queue_len(txq) < cwin) { > skb = skb_peek(&l->backlogq); > if (!skb) > break; > @@ -1140,6 +1157,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, > { > struct sk_buff *_skb, *skb = skb_peek(&l->transmq); > u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; > + bool retransmitted = false; > u16 ack = l->rcv_nxt - 1; > struct tipc_msg *hdr; > int rc = 0; > @@ -1173,11 +1191,13 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, > _skb->priority = TC_PRIO_CONTROL; > __skb_queue_tail(xmitq, _skb); > l->stats.retransmitted++; > - > + retransmitted = true; > /* Increase actual retrans counter & mark first time */ > if (!TIPC_SKB_CB(skb)->retr_cnt++) > TIPC_SKB_CB(skb)->retr_stamp = jiffies; > } > + if (retransmitted) > + l->window = l->min_win + (l->window - l->min_win) / 2; > return 0; > } > > @@ -1417,6 +1437,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, > struct sk_buff *skb, *_skb, *tmp; > struct tipc_msg *hdr; > u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; > + bool retransmitted = false; > u16 ack = l->rcv_nxt - 1; > bool passed = false; > u16 seqno, n = 0; > @@ -1449,7 +1470,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, > _skb->priority = TC_PRIO_CONTROL; > __skb_queue_tail(xmitq, _skb); > l->stats.retransmitted++; > - > + retransmitted = true; > /* Increase actual retrans counter & mark first time */ > if (!TIPC_SKB_CB(skb)->retr_cnt++) > TIPC_SKB_CB(skb)->retr_stamp = jiffies; > @@ -1463,7 +1484,8 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, > goto next_gap_ack; > } > } > - > + if (retransmitted) > + l->window = l->min_win + (l->window - l->min_win) / 2; > return 0; > } > > @@ -2305,15 +2327,17 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, > return 0; > } > > -void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) > +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) > { > int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE); > > - l->window = win; > - l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); > - l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2); > - l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3); > - l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4); > + l->window = min_win; > + l->min_win = min_win; > + l->max_win = max_win; > + l->backlog[TIPC_LOW_IMPORTANCE].limit = min_win * 2; > + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = min_win * 4; > + l->backlog[TIPC_HIGH_IMPORTANCE].limit = min_win * 6; > + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = min_win * 8; > l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; > } > > @@ -2366,10 +2390,10 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) > } > > if (props[TIPC_NLA_PROP_WIN]) { > - u32 win; > + u32 max_win; > > - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); > - if ((win < TIPC_MIN_LINK_WIN) || (win > TIPC_MAX_LINK_WIN)) > + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); > + if (max_win < TIPC_MIN_LINK_WIN || max_win > TIPC_MAX_LINK_WIN) > return -EINVAL; > } > > @@ -2605,7 +2629,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) > prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); > if (!prop) > goto attr_msg_full; > - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) > + if 
(nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->max_win)) > goto prop_msg_full; > if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode)) > goto prop_msg_full; > diff --git a/net/tipc/link.h b/net/tipc/link.h > index c09e9d4..d3c1c3f 100644 > --- a/net/tipc/link.h > +++ b/net/tipc/link.h > @@ -73,7 +73,7 @@ enum { > > bool tipc_link_create(struct net *net, char *if_name, int bearer_id, > int tolerance, char net_plane, u32 mtu, int priority, > - int window, u32 session, u32 ownnode, > + u32 min_win, u32 max_win, u32 session, u32 ownnode, > u32 peer, u8 *peer_id, u16 peer_caps, > struct tipc_link *bc_sndlink, > struct tipc_link *bc_rcvlink, > @@ -81,7 +81,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, > struct sk_buff_head *namedq, > struct tipc_link **link); > bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, > - int mtu, int window, u16 peer_caps, > + int mtu, u32 min_win, u32 max_win, u16 peer_caps, > struct sk_buff_head *inputq, > struct sk_buff_head *namedq, > struct tipc_link *bc_sndlink, > @@ -115,7 +115,8 @@ char *tipc_link_name_ext(struct tipc_link *l, char *buf); > u32 tipc_link_state(struct tipc_link *l); > char tipc_link_plane(struct tipc_link *l); > int tipc_link_prio(struct tipc_link *l); > -int tipc_link_window(struct tipc_link *l); > +int tipc_link_min_win(struct tipc_link *l); > +int tipc_link_max_win(struct tipc_link *l); > void tipc_link_update_caps(struct tipc_link *l, u16 capabilities); > bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr); > unsigned long tipc_link_tolerance(struct tipc_link *l); > @@ -124,7 +125,7 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, > void tipc_link_set_prio(struct tipc_link *l, u32 prio, > struct sk_buff_head *xmitq); > void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); > -void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); > +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win); > int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, > struct tipc_link *link, int nlflags); > int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); > diff --git a/net/tipc/node.c b/net/tipc/node.c > index aaf5956..970c780 100644 > --- a/net/tipc/node.c > +++ b/net/tipc/node.c > @@ -1134,7 +1134,8 @@ void tipc_node_check_dest(struct net *net, u32 addr, > snd_l = tipc_bc_sndlink(net); > if (!tipc_link_bc_create(net, tipc_own_addr(net), > addr, U16_MAX, > - tipc_link_window(snd_l), > + tipc_link_min_win(snd_l), > + tipc_link_max_win(snd_l), > n->capabilities, > &n->bc_entry.inputq1, > &n->bc_entry.namedq, snd_l, > @@ -1228,7 +1229,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, > get_random_bytes(&session, sizeof(u16)); > if (!tipc_link_create(net, if_name, b->identity, b->tolerance, > b->net_plane, b->mtu, b->priority, > - b->window, session, > + b->min_win, b->max_win, session, > tipc_own_addr(net), addr, peer_id, > n->capabilities, > tipc_bc_sndlink(n->net), n->bc_entry.link, > @@ -2374,10 +2375,12 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) > tipc_link_set_prio(link, prio, &xmitq); > } > if (props[TIPC_NLA_PROP_WIN]) { > - u32 win; > + u32 max_win; > > - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); > - tipc_link_set_queue_limits(link, win); > + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); > + tipc_link_set_queue_limits(link, > + tipc_link_min_win(link), > + max_win); > } > } > > diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c > index 
86aaa4d..7d9dcbb 100644 > --- a/net/tipc/udp_media.c > +++ b/net/tipc/udp_media.c > @@ -825,7 +825,8 @@ struct tipc_media udp_media_info = { > .msg2addr = tipc_udp_msg2addr, > .priority = TIPC_DEF_LINK_PRI, > .tolerance = TIPC_DEF_LINK_TOL, > - .window = TIPC_DEF_LINK_WIN, > + .min_win = TIPC_DEF_LINK_WIN, > + .max_win = TIPC_DEF_LINK_WIN, > .mtu = TIPC_DEF_LINK_UDP_MTU, > .type_id = TIPC_MEDIA_TYPE_UDP, > .hwaddr_len = 0, > -- > 2.1.4 |
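A note for readers: the window behaviour described above can be seen in isolation with the toy simulation below. The constants 50 and 500 and the one-increment-per-32-acknowledgements growth step are taken from the patch; the struct and helper names are invented and nothing here is kernel code.

/* Toy simulation of the variable-window rules in the patch: grow the
 * window by one for every 32 "full transmit queue" acknowledgements,
 * cap it at max_win, and on a retransmission fall half-way back
 * towards min_win.
 */
#include <stdio.h>

struct toy_link {
        unsigned int window, min_win, max_win, cong_acks;
};

static void on_full_window_ack(struct toy_link *l)
{
        unsigned int add = l->cong_acks++ % 32 ? 0 : 1;

        l->window += add;
        if (l->window > l->max_win)
                l->window = l->max_win;
}

static void on_retransmit(struct toy_link *l)
{
        l->window = l->min_win + (l->window - l->min_win) / 2;
}

int main(void)
{
        struct toy_link l = { .window = 50, .min_win = 50, .max_win = 500 };
        int i;

        for (i = 0; i < 3200; i++)  /* +1 per 32 acks => +100 in total */
                on_full_window_ack(&l);
        printf("after 3200 acks: window %u\n", l.window);  /* 150 */

        on_retransmit(&l);          /* back off half-way towards min_win */
        printf("after a NACK   : window %u\n", l.window);  /* 100 */
        return 0;
}

This is the "permanent congestion avoidance" shape referred to above: linear growth capped at max_win, plus a half-way-back-to-min_win cut on retransmission instead of TCP Reno's halving of the whole window.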
From: Jon M. <jon...@er...> - 2019-11-20 17:06:27
Acked-by: Jon Maloy <jon...@er...> > -----Original Message----- > From: Tuong Lien Tong <tuo...@de...> > Sent: 19-Nov-19 22:37 > To: Jon Maloy <jon...@er...>; 'Ying Xue' <yin...@wi...>; tipc- > dis...@li...; ma...@do... > Subject: RE: [iproute2] tipc: add new commands to set TIPC AEAD key > > Hi Jon/Ying, > > We still have this patch (i.e. for the 'tipc node set/flush key...' commands), may I put your ACK on it before > sending to iproute2-next? > Many thanks! > > BR/Tuong > > -----Original Message----- > From: Jon Maloy <jon...@er...> > Sent: Wednesday, October 16, 2019 9:51 PM > To: Ying Xue <yin...@wi...>; Tuong Tong Lien <tuo...@de...>; tipc- > dis...@li...; ma...@do... > Subject: RE: [iproute2] tipc: add new commands to set TIPC AEAD key > > > > > -----Original Message----- > > From: Ying Xue <yin...@wi...> > > Sent: 16-Oct-19 08:30 > > To: Tuong Tong Lien <tuo...@de...>; tipc- > > dis...@li...; Jon Maloy <jon...@er...>; > > ma...@do... > > Subject: Re: [iproute2] tipc: add new commands to set TIPC AEAD key > > > > Tt looks like we will use "tipc node" command to configure static key to TIPC > > module, right? > > The key is static in the sense that TIPC itself cannot change the key. But the protocol ensures that keys can > be replaced without any traffic disturbances. > > > > > Do we plan to support dynamic key setting? If yes, what kinds of key exchange > > protocol would we use? For example, in IPSEC, it uses IKEv2 as its key > > exchange protocol. > > At the moment we assume there is an external user land framework where node authentication is done > and where keys are generated and distributed (via TLS) to the nodes. > When we want to replace a key (probably at fix pre-defined intervals), the framework has to generate new > keys and distribute/inject those to TIPC. > > > > > Will key be expired after a specific lifetime? For instance, in > > IPSEC/Raccoon2 or strongswan, they use rekey feature to provide this > > function to make security association safer. > > We are considering this, so that the external framework can be kept simpler or even be eliminated. That > would be the next step, once this series is applied. > > Regards > ///jon > > > > > > On 10/14/19 7:36 PM, Tuong Lien wrote: > > > Two new commands are added as part of 'tipc node' command: > > > > > > $tipc node set key KEY [algname ALGNAME] [nodeid NODEID] $tipc node > > > flush key > > > > > > which enable user to set and remove AEAD keys in kernel TIPC. > > > > > > For the 'set key' command, the given 'nodeid' parameter decides the > > > mode to be applied to the key, particularly: > > > > > > - If NODEID is empty, the key is a 'cluster' key which will be used > > > for all message encryption/decryption from/to the node (i.e. both TX & RX). > > > The same key needs to be set in the other nodes i.e. the 'cluster key' > > > mode. > > > > > > - If NODEID is own node, the key is used for message encryption (TX) > > > from the node. Whereas, if NODEID is a peer node, the key is for > > > message decryption (RX) from that peer node. > > > This is the 'per-node-key' mode that each nodes in the cluster has its > > > specific (TX) key. 
> > > > > > Signed-off-by: Tuong Lien <tuo...@de...> > > > --- > > > include/uapi/linux/tipc.h | 21 ++++++ > > > include/uapi/linux/tipc_netlink.h | 4 ++ > > > tipc/misc.c | 38 +++++++++++ > > > tipc/misc.h | 1 + > > > tipc/node.c | 133 > > +++++++++++++++++++++++++++++++++++++- > > > 5 files changed, 195 insertions(+), 2 deletions(-) > > > > > > diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h > > > index e16cb4e2..b118ce9b 100644 > > > --- a/include/uapi/linux/tipc.h > > > +++ b/include/uapi/linux/tipc.h > > > @@ -232,6 +232,27 @@ struct tipc_sioc_nodeid_req { > > > char node_id[TIPC_NODEID_LEN]; > > > }; > > > > > > +/* > > > + * TIPC Crypto, AEAD mode > > > + */ > > > +#define TIPC_AEAD_MAX_ALG_NAME (32) > > > +#define TIPC_AEAD_MIN_KEYLEN (16 + 4) > > > +#define TIPC_AEAD_MAX_KEYLEN (32 + 4) > > > + > > > +struct tipc_aead_key { > > > + char alg_name[TIPC_AEAD_MAX_ALG_NAME]; > > > + unsigned int keylen; /* in bytes */ > > > + char key[]; > > > +}; > > > + > > > +#define TIPC_AEAD_KEY_MAX_SIZE (sizeof(struct tipc_aead_key) + \ > > > + TIPC_AEAD_MAX_KEYLEN) > > > + > > > +static inline int tipc_aead_key_size(struct tipc_aead_key *key) { > > > + return sizeof(*key) + key->keylen; > > > +} > > > + > > > /* The macros and functions below are deprecated: > > > */ > > > > > > diff --git a/include/uapi/linux/tipc_netlink.h > > > b/include/uapi/linux/tipc_netlink.h > > > index efb958fd..6c2194ab 100644 > > > --- a/include/uapi/linux/tipc_netlink.h > > > +++ b/include/uapi/linux/tipc_netlink.h > > > @@ -63,6 +63,8 @@ enum { > > > TIPC_NL_PEER_REMOVE, > > > TIPC_NL_BEARER_ADD, > > > TIPC_NL_UDP_GET_REMOTEIP, > > > + TIPC_NL_KEY_SET, > > > + TIPC_NL_KEY_FLUSH, > > > > > > __TIPC_NL_CMD_MAX, > > > TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -160,6 +162,8 > > @@ enum { > > > TIPC_NLA_NODE_UNSPEC, > > > TIPC_NLA_NODE_ADDR, /* u32 */ > > > TIPC_NLA_NODE_UP, /* flag */ > > > + TIPC_NLA_NODE_ID, /* data */ > > > + TIPC_NLA_NODE_KEY, /* data */ > > > > > > __TIPC_NLA_NODE_MAX, > > > TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 diff --git > > a/tipc/misc.c > > > b/tipc/misc.c index e4b1cd0c..1daf3072 100644 > > > --- a/tipc/misc.c > > > +++ b/tipc/misc.c > > > @@ -98,6 +98,44 @@ int str2nodeid(char *str, uint8_t *id) > > > return 0; > > > } > > > > > > +int str2key(char *str, struct tipc_aead_key *key) { > > > + int len = strlen(str); > > > + int ishex = 0; > > > + int i; > > > + > > > + /* Check if the input is a hex string (i.e. 0x...) 
*/ > > > + if (len > 2 && strncmp(str, "0x", 2) == 0) { > > > + ishex = is_hex(str + 2, len - 2 - 1); > > > + if (ishex) { > > > + len -= 2; > > > + str += 2; > > > + } > > > + } > > > + > > > + /* Obtain key: */ > > > + if (!ishex) { > > > + key->keylen = len; > > > + memcpy(key->key, str, len); > > > + } else { > > > + /* Convert hex string to key */ > > > + key->keylen = (len + 1) / 2; > > > + for (i = 0; i < key->keylen; i++) { > > > + if (i == 0 && len % 2 != 0) { > > > + if (sscanf(str, "%1hhx", &key->key[0]) != 1) > > > + return -1; > > > + str += 1; > > > + continue; > > > + } > > > + if (sscanf(str, "%2hhx", &key->key[i]) != 1) > > > + return -1; > > > + str += 2; > > > + } > > > + } > > > + > > > + return 0; > > > +} > > > + > > > void nodeid2str(uint8_t *id, char *str) { > > > int i; > > > diff --git a/tipc/misc.h b/tipc/misc.h index ff2f31f1..59309f68 100644 > > > --- a/tipc/misc.h > > > +++ b/tipc/misc.h > > > @@ -18,5 +18,6 @@ uint32_t str2addr(char *str); int str2nodeid(char > > > *str, uint8_t *id); void nodeid2str(uint8_t *id, char *str); void > > > hash2nodestr(uint32_t hash, char *str); > > > +int str2key(char *str, struct tipc_aead_key *key); > > > > > > #endif > > > diff --git a/tipc/node.c b/tipc/node.c index 2fec6753..fc81bd30 100644 > > > --- a/tipc/node.c > > > +++ b/tipc/node.c > > > @@ -157,6 +157,111 @@ static int cmd_node_set_nodeid(struct nlmsghdr > > *nlh, const struct cmd *cmd, > > > return msg_doit(nlh, NULL, NULL); > > > } > > > > > > +static void cmd_node_set_key_help(struct cmdl *cmdl) { > > > + fprintf(stderr, > > > + "Usage: %s node set key KEY [algname ALGNAME] [nodeid > > NODEID]\n\n" > > > + "PROPERTIES\n" > > > + " KEY - Symmetric KEY & SALT as a normal or hex > > string\n" > > > + " that consists of two parts:\n" > > > + " [KEY: 16, 24 or 32 octets][SALT: 4 octets]\n\n" > > > + " algname ALGNAME - Default: \"gcm(aes)\"\n\n" > > > + " nodeid NODEID - Own or peer node identity to which the > > key will\n" > > > + " be attached. 
If not present, the key is a cluster\n" > > > + " key!\n\n" > > > + "EXAMPLES\n" > > > + " %s node set key this_is_a_key16_salt algname \"gcm(aes)\" > > nodeid node1\n" > > > + " %s node set key > > 0x746869735F69735F615F6B657931365F73616C74 nodeid node2\n\n", > > > + cmdl->argv[0], cmdl->argv[0], cmdl->argv[0]); } > > > + > > > +static int cmd_node_set_key(struct nlmsghdr *nlh, const struct cmd *cmd, > > > + struct cmdl *cmdl, void *data) { > > > + struct { > > > + struct tipc_aead_key key; > > > + char mem[TIPC_AEAD_MAX_KEYLEN + 1]; > > > + } input = {}; > > > + struct opt opts[] = { > > > + { "algname", OPT_KEYVAL, NULL }, > > > + { "nodeid", OPT_KEYVAL, NULL }, > > > + { NULL } > > > + }; > > > + struct nlattr *nest; > > > + struct opt *opt_algname, *opt_nodeid; > > > + char buf[MNL_SOCKET_BUFFER_SIZE]; > > > + uint8_t id[TIPC_NODEID_LEN] = {0,}; > > > + int keysize; > > > + char *str; > > > + > > > + if (help_flag) { > > > + (cmd->help)(cmdl); > > > + return -EINVAL; > > > + } > > > + > > > + if (cmdl->optind >= cmdl->argc) { > > > + fprintf(stderr, "error, missing key\n"); > > > + return -EINVAL; > > > + } > > > + > > > + /* Get user key */ > > > + str = shift_cmdl(cmdl); > > > + if (str2key(str, &input.key)) { > > > + fprintf(stderr, "error, invalid key input\n"); > > > + return -EINVAL; > > > + } > > > + > > > + if (parse_opts(opts, cmdl) < 0) > > > + return -EINVAL; > > > + > > > + /* Get algorithm name, default: "gcm(aes)" */ > > > + opt_algname = get_opt(opts, "algname"); > > > + if (!opt_algname) > > > + strcpy(input.key.alg_name, "gcm(aes)"); > > > + else > > > + strcpy(input.key.alg_name, opt_algname->val); > > > + > > > + /* Get node identity */ > > > + opt_nodeid = get_opt(opts, "nodeid"); > > > + if (opt_nodeid && str2nodeid(opt_nodeid->val, id)) { > > > + fprintf(stderr, "error, invalid node identity\n"); > > > + return -EINVAL; > > > + } > > > + > > > + /* Init & do the command */ > > > + nlh = msg_init(buf, TIPC_NL_KEY_SET); > > > + if (!nlh) { > > > + fprintf(stderr, "error, message initialisation failed\n"); > > > + return -1; > > > + } > > > + nest = mnl_attr_nest_start(nlh, TIPC_NLA_NODE); > > > + keysize = tipc_aead_key_size(&input.key); > > > + mnl_attr_put(nlh, TIPC_NLA_NODE_KEY, keysize, &input.key); > > > + if (opt_nodeid) > > > + mnl_attr_put(nlh, TIPC_NLA_NODE_ID, TIPC_NODEID_LEN, id); > > > + mnl_attr_nest_end(nlh, nest); > > > + return msg_doit(nlh, NULL, NULL); > > > +} > > > + > > > +static int cmd_node_flush_key(struct nlmsghdr *nlh, const struct cmd > > *cmd, > > > + struct cmdl *cmdl, void *data) { > > > + char buf[MNL_SOCKET_BUFFER_SIZE]; > > > + > > > + if (help_flag) { > > > + (cmd->help)(cmdl); > > > + return -EINVAL; > > > + } > > > + > > > + /* Init & do the command */ > > > + nlh = msg_init(buf, TIPC_NL_KEY_FLUSH); > > > + if (!nlh) { > > > + fprintf(stderr, "error, message initialisation failed\n"); > > > + return -1; > > > + } > > > + return msg_doit(nlh, NULL, NULL); > > > +} > > > + > > > static int nodeid_get_cb(const struct nlmsghdr *nlh, void *data) { > > > struct nlattr *info[TIPC_NLA_MAX + 1] = {}; @@ -270,13 +375,34 @@ > > > static int cmd_node_set_netid(struct nlmsghdr *nlh, const struct cmd *cmd, > > > return msg_doit(nlh, NULL, NULL); > > > } > > > > > > +static void cmd_node_flush_help(struct cmdl *cmdl) { > > > + fprintf(stderr, > > > + "Usage: %s node flush PROPERTY\n\n" > > > + "PROPERTIES\n" > > > + " key - Flush all symmetric-keys\n", > > > + cmdl->argv[0]); > > > +} > > > + > > > +static int cmd_node_flush(struct nlmsghdr 
*nlh, const struct cmd *cmd, > > > + struct cmdl *cmdl, void *data) > > > +{ > > > + const struct cmd cmds[] = { > > > + { "key", cmd_node_flush_key, NULL }, > > > + { NULL } > > > + }; > > > + > > > + return run_cmd(nlh, cmd, cmds, cmdl, NULL); } > > > + > > > static void cmd_node_set_help(struct cmdl *cmdl) { > > > fprintf(stderr, > > > "Usage: %s node set PROPERTY\n\n" > > > "PROPERTIES\n" > > > " identity NODEID - Set node identity\n" > > > - " clusterid CLUSTERID - Set local cluster id\n", > > > + " clusterid CLUSTERID - Set local cluster id\n" > > > + " key PROPERTY - Set symmetric-key\n", > > > cmdl->argv[0]); > > > } > > > > > > @@ -288,6 +414,7 @@ static int cmd_node_set(struct nlmsghdr *nlh, > > const struct cmd *cmd, > > > { "identity", cmd_node_set_nodeid, NULL }, > > > { "netid", cmd_node_set_netid, NULL }, > > > { "clusterid", cmd_node_set_netid, NULL }, > > > + { "key", cmd_node_set_key, cmd_node_set_key_help }, > > > { NULL } > > > }; > > > > > > @@ -325,7 +452,8 @@ void cmd_node_help(struct cmdl *cmdl) > > > "COMMANDS\n" > > > " list - List remote nodes\n" > > > " get - Get local node parameters\n" > > > - " set - Set local node parameters\n", > > > + " set - Set local node parameters\n" > > > + " flush - Flush local node parameters\n", > > > cmdl->argv[0]); > > > } > > > > > > @@ -336,6 +464,7 @@ int cmd_node(struct nlmsghdr *nlh, const struct > > cmd *cmd, struct cmdl *cmdl, > > > { "list", cmd_node_list, NULL }, > > > { "get", cmd_node_get, cmd_node_get_help }, > > > { "set", cmd_node_set, cmd_node_set_help }, > > > + { "flush", cmd_node_flush, cmd_node_flush_help}, > > > { NULL } > > > }; > > > > > > |
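A note for readers: the key given to 'tipc node set key' may be either a literal byte string or a hex string prefixed with "0x", as handled by str2key() in the quoted patch. The stand-alone program below mirrors that parsing for illustration only; parse_key() and its fixed-size buffer are simplifications, and the odd-length hex case handled by the original is omitted.

/* Simplified user-space version of the str2key() parsing idea:
 * "0x..." is hex-decoded, anything else is taken literally
 * (key octets followed by the 4-octet salt).
 */
#include <stdio.h>
#include <string.h>

static int parse_key(const char *str, unsigned char *key, unsigned int *keylen)
{
        size_t len = strlen(str);
        unsigned int i;

        if (len > 2 && strncmp(str, "0x", 2) == 0) {  /* hex form */
                str += 2;
                len -= 2;
                *keylen = (unsigned int)(len / 2);
                for (i = 0; i < *keylen; i++, str += 2)
                        if (sscanf(str, "%2hhx", &key[i]) != 1)
                                return -1;
                return 0;
        }
        *keylen = (unsigned int)len;                  /* literal form */
        memcpy(key, str, len);
        return 0;
}

int main(void)
{
        unsigned char key[64];
        unsigned int keylen;

        /* 16-octet key + 4-octet salt, as in the patch's help text */
        if (!parse_key("this_is_a_key16_salt", key, &keylen))
                printf("literal key: %u octets\n", keylen);

        if (!parse_key("0x746869735F69735F615F6B657931365F73616C74",
                       key, &keylen))
                printf("hex key    : %u octets\n", keylen);
        return 0;
}

Both example inputs decode to the same 20 octets, a 16-octet key followed by the 4-octet salt, matching the EXAMPLES section of the patch's help text.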
From: Jon M. <jon...@er...> - 2019-11-20 17:03:37
Acked-by: Jon Maloy <jon...@er...> > -----Original Message----- > From: Hoang Le <hoa...@de...> > Sent: 20-Nov-19 03:39 > To: Jon Maloy <jon...@er...>; ma...@do...; tip...@li...; > yin...@wi... > Subject: [net-next] tipc: update replicast capability for broadcast send link > > When setting up a cluster with non-replicast/replicast capability > supported. This capability will be disabled for broadcast send link > in order to be backwards compatible. > > However, when these non-support nodes left and be removed out the cluster. > We don't update this capability on broadcast send link. Then, some of > features that based on this capability will also disabling as unexpected. > > In this commit, we make sure the broadcast send link capabilities will > be re-calculated as soon as a node removed/rejoined a cluster. > > Signed-off-by: Hoang Le <hoa...@de...> > --- > net/tipc/bcast.c | 4 ++-- > net/tipc/bcast.h | 2 +- > net/tipc/link.c | 2 +- > net/tipc/node.c | 8 +++++++- > 4 files changed, 11 insertions(+), 5 deletions(-) > > diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c > index 44ed481fec47..3d14e60ef642 100644 > --- a/net/tipc/bcast.c > +++ b/net/tipc/bcast.c > @@ -87,9 +87,9 @@ int tipc_bcast_get_mtu(struct net *net) > return tipc_link_mss(tipc_bc_sndlink(net)); > } > > -void tipc_bcast_disable_rcast(struct net *net) > +void tipc_bcast_toggle_rcast(struct net *net, bool supp) > { > - tipc_bc_base(net)->rcast_support = false; > + tipc_bc_base(net)->rcast_support = supp; > } > > static void tipc_bcbase_calc_bc_threshold(struct net *net) > diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h > index dadad953e2be..9e847d9617d3 100644 > --- a/net/tipc/bcast.h > +++ b/net/tipc/bcast.h > @@ -85,7 +85,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl); > void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); > void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); > int tipc_bcast_get_mtu(struct net *net); > -void tipc_bcast_disable_rcast(struct net *net); > +void tipc_bcast_toggle_rcast(struct net *net, bool supp); > int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, > struct tipc_mc_method *method, struct tipc_nlist *dests, > u16 *cong_link_cnt); > diff --git a/net/tipc/link.c b/net/tipc/link.c > index a2e9a64d5a0f..5153b9bb7b3f 100644 > --- a/net/tipc/link.c > +++ b/net/tipc/link.c > @@ -550,7 +550,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, > > /* Disable replicast if even a single peer doesn't support it */ > if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST)) > - tipc_bcast_disable_rcast(net); > + tipc_bcast_toggle_rcast(net, false); > > return true; > } > diff --git a/net/tipc/node.c b/net/tipc/node.c > index b058647fa78b..b9f6b5dfdb5b 100644 > --- a/net/tipc/node.c > +++ b/net/tipc/node.c > @@ -496,6 +496,9 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, > tn->capabilities &= temp_node->capabilities; > } > > + tipc_bcast_toggle_rcast(net, > + (tn->capabilities & TIPC_BCAST_RCAST)); > + > goto exit; > } > n = kzalloc(sizeof(*n), GFP_ATOMIC); > @@ -557,6 +560,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, > list_for_each_entry_rcu(temp_node, &tn->node_list, list) { > tn->capabilities &= temp_node->capabilities; > } > + tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); > trace_tipc_node_create(n, true, " "); > exit: > spin_unlock_bh(&tn->node_list_lock); > @@ -740,7 +744,8 @@ static bool tipc_node_cleanup(struct 
tipc_node *peer) > list_for_each_entry_rcu(temp_node, &tn->node_list, list) { > tn->capabilities &= temp_node->capabilities; > } > - > + tipc_bcast_toggle_rcast(peer->net, > + (tn->capabilities & TIPC_BCAST_RCAST)); > spin_unlock_bh(&tn->node_list_lock); > return deleted; > } > @@ -2198,6 +2203,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) > list_for_each_entry_rcu(temp_node, &tn->node_list, list) { > tn->capabilities &= temp_node->capabilities; > } > + tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); > err = 0; > err_out: > tipc_node_put(peer); > -- > 2.20.1 |
From: Xue, Y. <Yin...@wi...> - 2019-11-20 15:03:03
Hi Tuong, This patch is fine for us. Sure, you can my ack-by. Thanks, Yiing -----Original Message----- From: Tuong Lien Tong [mailto:tuo...@de...] Sent: Wednesday, November 20, 2019 11:37 AM To: 'Jon Maloy'; Xue, Ying; tip...@li...; ma...@do... Subject: RE: [iproute2] tipc: add new commands to set TIPC AEAD key Hi Jon/Ying, We still have this patch (i.e. for the 'tipc node set/flush key...' commands), may I put your ACK on it before sending to iproute2-next? Many thanks! BR/Tuong -----Original Message----- From: Jon Maloy <jon...@er...> Sent: Wednesday, October 16, 2019 9:51 PM To: Ying Xue <yin...@wi...>; Tuong Tong Lien <tuo...@de...>; tip...@li...; ma...@do... Subject: RE: [iproute2] tipc: add new commands to set TIPC AEAD key > -----Original Message----- > From: Ying Xue <yin...@wi...> > Sent: 16-Oct-19 08:30 > To: Tuong Tong Lien <tuo...@de...>; tipc- > dis...@li...; Jon Maloy <jon...@er...>; > ma...@do... > Subject: Re: [iproute2] tipc: add new commands to set TIPC AEAD key > > Tt looks like we will use "tipc node" command to configure static key to TIPC > module, right? The key is static in the sense that TIPC itself cannot change the key. But the protocol ensures that keys can be replaced without any traffic disturbances. > > Do we plan to support dynamic key setting? If yes, what kinds of key exchange > protocol would we use? For example, in IPSEC, it uses IKEv2 as its key > exchange protocol. At the moment we assume there is an external user land framework where node authentication is done and where keys are generated and distributed (via TLS) to the nodes. When we want to replace a key (probably at fix pre-defined intervals), the framework has to generate new keys and distribute/inject those to TIPC. > > Will key be expired after a specific lifetime? For instance, in > IPSEC/Raccoon2 or strongswan, they use rekey feature to provide this > function to make security association safer. We are considering this, so that the external framework can be kept simpler or even be eliminated. That would be the next step, once this series is applied. Regards ///jon > > On 10/14/19 7:36 PM, Tuong Lien wrote: > > Two new commands are added as part of 'tipc node' command: > > > > $tipc node set key KEY [algname ALGNAME] [nodeid NODEID] $tipc node > > flush key > > > > which enable user to set and remove AEAD keys in kernel TIPC. > > > > For the 'set key' command, the given 'nodeid' parameter decides the > > mode to be applied to the key, particularly: > > > > - If NODEID is empty, the key is a 'cluster' key which will be used > > for all message encryption/decryption from/to the node (i.e. both TX & RX). > > The same key needs to be set in the other nodes i.e. the 'cluster key' > > mode. > > > > - If NODEID is own node, the key is used for message encryption (TX) > > from the node. Whereas, if NODEID is a peer node, the key is for > > message decryption (RX) from that peer node. > > This is the 'per-node-key' mode that each nodes in the cluster has its > > specific (TX) key. 
> > > > Signed-off-by: Tuong Lien <tuo...@de...> > > --- > > include/uapi/linux/tipc.h | 21 ++++++ > > include/uapi/linux/tipc_netlink.h | 4 ++ > > tipc/misc.c | 38 +++++++++++ > > tipc/misc.h | 1 + > > tipc/node.c | 133 > +++++++++++++++++++++++++++++++++++++- > > 5 files changed, 195 insertions(+), 2 deletions(-) > > > > diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h > > index e16cb4e2..b118ce9b 100644 > > --- a/include/uapi/linux/tipc.h > > +++ b/include/uapi/linux/tipc.h > > @@ -232,6 +232,27 @@ struct tipc_sioc_nodeid_req { > > char node_id[TIPC_NODEID_LEN]; > > }; > > > > +/* > > + * TIPC Crypto, AEAD mode > > + */ > > +#define TIPC_AEAD_MAX_ALG_NAME (32) > > +#define TIPC_AEAD_MIN_KEYLEN (16 + 4) > > +#define TIPC_AEAD_MAX_KEYLEN (32 + 4) > > + > > +struct tipc_aead_key { > > + char alg_name[TIPC_AEAD_MAX_ALG_NAME]; > > + unsigned int keylen; /* in bytes */ > > + char key[]; > > +}; > > + > > +#define TIPC_AEAD_KEY_MAX_SIZE (sizeof(struct tipc_aead_key) + \ > > + TIPC_AEAD_MAX_KEYLEN) > > + > > +static inline int tipc_aead_key_size(struct tipc_aead_key *key) { > > + return sizeof(*key) + key->keylen; > > +} > > + > > /* The macros and functions below are deprecated: > > */ > > > > diff --git a/include/uapi/linux/tipc_netlink.h > > b/include/uapi/linux/tipc_netlink.h > > index efb958fd..6c2194ab 100644 > > --- a/include/uapi/linux/tipc_netlink.h > > +++ b/include/uapi/linux/tipc_netlink.h > > @@ -63,6 +63,8 @@ enum { > > TIPC_NL_PEER_REMOVE, > > TIPC_NL_BEARER_ADD, > > TIPC_NL_UDP_GET_REMOTEIP, > > + TIPC_NL_KEY_SET, > > + TIPC_NL_KEY_FLUSH, > > > > __TIPC_NL_CMD_MAX, > > TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -160,6 +162,8 > @@ enum { > > TIPC_NLA_NODE_UNSPEC, > > TIPC_NLA_NODE_ADDR, /* u32 */ > > TIPC_NLA_NODE_UP, /* flag */ > > + TIPC_NLA_NODE_ID, /* data */ > > + TIPC_NLA_NODE_KEY, /* data */ > > > > __TIPC_NLA_NODE_MAX, > > TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 diff --git > a/tipc/misc.c > > b/tipc/misc.c index e4b1cd0c..1daf3072 100644 > > --- a/tipc/misc.c > > +++ b/tipc/misc.c > > @@ -98,6 +98,44 @@ int str2nodeid(char *str, uint8_t *id) > > return 0; > > } > > > > +int str2key(char *str, struct tipc_aead_key *key) { > > + int len = strlen(str); > > + int ishex = 0; > > + int i; > > + > > + /* Check if the input is a hex string (i.e. 0x...) 
*/ > > + if (len > 2 && strncmp(str, "0x", 2) == 0) { > > + ishex = is_hex(str + 2, len - 2 - 1); > > + if (ishex) { > > + len -= 2; > > + str += 2; > > + } > > + } > > + > > + /* Obtain key: */ > > + if (!ishex) { > > + key->keylen = len; > > + memcpy(key->key, str, len); > > + } else { > > + /* Convert hex string to key */ > > + key->keylen = (len + 1) / 2; > > + for (i = 0; i < key->keylen; i++) { > > + if (i == 0 && len % 2 != 0) { > > + if (sscanf(str, "%1hhx", &key->key[0]) != 1) > > + return -1; > > + str += 1; > > + continue; > > + } > > + if (sscanf(str, "%2hhx", &key->key[i]) != 1) > > + return -1; > > + str += 2; > > + } > > + } > > + > > + return 0; > > +} > > + > > void nodeid2str(uint8_t *id, char *str) { > > int i; > > diff --git a/tipc/misc.h b/tipc/misc.h index ff2f31f1..59309f68 100644 > > --- a/tipc/misc.h > > +++ b/tipc/misc.h > > @@ -18,5 +18,6 @@ uint32_t str2addr(char *str); int str2nodeid(char > > *str, uint8_t *id); void nodeid2str(uint8_t *id, char *str); void > > hash2nodestr(uint32_t hash, char *str); > > +int str2key(char *str, struct tipc_aead_key *key); > > > > #endif > > diff --git a/tipc/node.c b/tipc/node.c index 2fec6753..fc81bd30 100644 > > --- a/tipc/node.c > > +++ b/tipc/node.c > > @@ -157,6 +157,111 @@ static int cmd_node_set_nodeid(struct nlmsghdr > *nlh, const struct cmd *cmd, > > return msg_doit(nlh, NULL, NULL); > > } > > > > +static void cmd_node_set_key_help(struct cmdl *cmdl) { > > + fprintf(stderr, > > + "Usage: %s node set key KEY [algname ALGNAME] [nodeid > NODEID]\n\n" > > + "PROPERTIES\n" > > + " KEY - Symmetric KEY & SALT as a normal or hex > string\n" > > + " that consists of two parts:\n" > > + " [KEY: 16, 24 or 32 octets][SALT: 4 octets]\n\n" > > + " algname ALGNAME - Default: \"gcm(aes)\"\n\n" > > + " nodeid NODEID - Own or peer node identity to which the > key will\n" > > + " be attached. 
If not present, the key is a cluster\n" > > + " key!\n\n" > > + "EXAMPLES\n" > > + " %s node set key this_is_a_key16_salt algname \"gcm(aes)\" > nodeid node1\n" > > + " %s node set key > 0x746869735F69735F615F6B657931365F73616C74 nodeid node2\n\n", > > + cmdl->argv[0], cmdl->argv[0], cmdl->argv[0]); } > > + > > +static int cmd_node_set_key(struct nlmsghdr *nlh, const struct cmd *cmd, > > + struct cmdl *cmdl, void *data) { > > + struct { > > + struct tipc_aead_key key; > > + char mem[TIPC_AEAD_MAX_KEYLEN + 1]; > > + } input = {}; > > + struct opt opts[] = { > > + { "algname", OPT_KEYVAL, NULL }, > > + { "nodeid", OPT_KEYVAL, NULL }, > > + { NULL } > > + }; > > + struct nlattr *nest; > > + struct opt *opt_algname, *opt_nodeid; > > + char buf[MNL_SOCKET_BUFFER_SIZE]; > > + uint8_t id[TIPC_NODEID_LEN] = {0,}; > > + int keysize; > > + char *str; > > + > > + if (help_flag) { > > + (cmd->help)(cmdl); > > + return -EINVAL; > > + } > > + > > + if (cmdl->optind >= cmdl->argc) { > > + fprintf(stderr, "error, missing key\n"); > > + return -EINVAL; > > + } > > + > > + /* Get user key */ > > + str = shift_cmdl(cmdl); > > + if (str2key(str, &input.key)) { > > + fprintf(stderr, "error, invalid key input\n"); > > + return -EINVAL; > > + } > > + > > + if (parse_opts(opts, cmdl) < 0) > > + return -EINVAL; > > + > > + /* Get algorithm name, default: "gcm(aes)" */ > > + opt_algname = get_opt(opts, "algname"); > > + if (!opt_algname) > > + strcpy(input.key.alg_name, "gcm(aes)"); > > + else > > + strcpy(input.key.alg_name, opt_algname->val); > > + > > + /* Get node identity */ > > + opt_nodeid = get_opt(opts, "nodeid"); > > + if (opt_nodeid && str2nodeid(opt_nodeid->val, id)) { > > + fprintf(stderr, "error, invalid node identity\n"); > > + return -EINVAL; > > + } > > + > > + /* Init & do the command */ > > + nlh = msg_init(buf, TIPC_NL_KEY_SET); > > + if (!nlh) { > > + fprintf(stderr, "error, message initialisation failed\n"); > > + return -1; > > + } > > + nest = mnl_attr_nest_start(nlh, TIPC_NLA_NODE); > > + keysize = tipc_aead_key_size(&input.key); > > + mnl_attr_put(nlh, TIPC_NLA_NODE_KEY, keysize, &input.key); > > + if (opt_nodeid) > > + mnl_attr_put(nlh, TIPC_NLA_NODE_ID, TIPC_NODEID_LEN, id); > > + mnl_attr_nest_end(nlh, nest); > > + return msg_doit(nlh, NULL, NULL); > > +} > > + > > +static int cmd_node_flush_key(struct nlmsghdr *nlh, const struct cmd > *cmd, > > + struct cmdl *cmdl, void *data) { > > + char buf[MNL_SOCKET_BUFFER_SIZE]; > > + > > + if (help_flag) { > > + (cmd->help)(cmdl); > > + return -EINVAL; > > + } > > + > > + /* Init & do the command */ > > + nlh = msg_init(buf, TIPC_NL_KEY_FLUSH); > > + if (!nlh) { > > + fprintf(stderr, "error, message initialisation failed\n"); > > + return -1; > > + } > > + return msg_doit(nlh, NULL, NULL); > > +} > > + > > static int nodeid_get_cb(const struct nlmsghdr *nlh, void *data) { > > struct nlattr *info[TIPC_NLA_MAX + 1] = {}; @@ -270,13 +375,34 @@ > > static int cmd_node_set_netid(struct nlmsghdr *nlh, const struct cmd *cmd, > > return msg_doit(nlh, NULL, NULL); > > } > > > > +static void cmd_node_flush_help(struct cmdl *cmdl) { > > + fprintf(stderr, > > + "Usage: %s node flush PROPERTY\n\n" > > + "PROPERTIES\n" > > + " key - Flush all symmetric-keys\n", > > + cmdl->argv[0]); > > +} > > + > > +static int cmd_node_flush(struct nlmsghdr *nlh, const struct cmd *cmd, > > + struct cmdl *cmdl, void *data) > > +{ > > + const struct cmd cmds[] = { > > + { "key", cmd_node_flush_key, NULL }, > > + { NULL } > > + }; > > + > > + return run_cmd(nlh, cmd, cmds, 
cmdl, NULL); } > > + > > static void cmd_node_set_help(struct cmdl *cmdl) { > > fprintf(stderr, > > "Usage: %s node set PROPERTY\n\n" > > "PROPERTIES\n" > > " identity NODEID - Set node identity\n" > > - " clusterid CLUSTERID - Set local cluster id\n", > > + " clusterid CLUSTERID - Set local cluster id\n" > > + " key PROPERTY - Set symmetric-key\n", > > cmdl->argv[0]); > > } > > > > @@ -288,6 +414,7 @@ static int cmd_node_set(struct nlmsghdr *nlh, > const struct cmd *cmd, > > { "identity", cmd_node_set_nodeid, NULL }, > > { "netid", cmd_node_set_netid, NULL }, > > { "clusterid", cmd_node_set_netid, NULL }, > > + { "key", cmd_node_set_key, cmd_node_set_key_help }, > > { NULL } > > }; > > > > @@ -325,7 +452,8 @@ void cmd_node_help(struct cmdl *cmdl) > > "COMMANDS\n" > > " list - List remote nodes\n" > > " get - Get local node parameters\n" > > - " set - Set local node parameters\n", > > + " set - Set local node parameters\n" > > + " flush - Flush local node parameters\n", > > cmdl->argv[0]); > > } > > > > @@ -336,6 +464,7 @@ int cmd_node(struct nlmsghdr *nlh, const struct > cmd *cmd, struct cmdl *cmdl, > > { "list", cmd_node_list, NULL }, > > { "get", cmd_node_get, cmd_node_get_help }, > > { "set", cmd_node_set, cmd_node_set_help }, > > + { "flush", cmd_node_flush, cmd_node_flush_help}, > > { NULL } > > }; > > > > |
From: Hoang Le <hoa...@de...> - 2019-11-20 08:39:21
|
When setting up a cluster with non-replicast/replicast capability supported. This capability will be disabled for broadcast send link in order to be backwards compatible. However, when these non-support nodes left and be removed out the cluster. We don't update this capability on broadcast send link. Then, some of features that based on this capability will also disabling as unexpected. In this commit, we make sure the broadcast send link capabilities will be re-calculated as soon as a node removed/rejoined a cluster. Signed-off-by: Hoang Le <hoa...@de...> --- net/tipc/bcast.c | 4 ++-- net/tipc/bcast.h | 2 +- net/tipc/link.c | 2 +- net/tipc/node.c | 8 +++++++- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 44ed481fec47..3d14e60ef642 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -87,9 +87,9 @@ int tipc_bcast_get_mtu(struct net *net) return tipc_link_mss(tipc_bc_sndlink(net)); } -void tipc_bcast_disable_rcast(struct net *net) +void tipc_bcast_toggle_rcast(struct net *net, bool supp) { - tipc_bc_base(net)->rcast_support = false; + tipc_bc_base(net)->rcast_support = supp; } static void tipc_bcbase_calc_bc_threshold(struct net *net) diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index dadad953e2be..9e847d9617d3 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -85,7 +85,7 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_bcl); void tipc_bcast_inc_bearer_dst_cnt(struct net *net, int bearer_id); void tipc_bcast_dec_bearer_dst_cnt(struct net *net, int bearer_id); int tipc_bcast_get_mtu(struct net *net); -void tipc_bcast_disable_rcast(struct net *net); +void tipc_bcast_toggle_rcast(struct net *net, bool supp); int tipc_mcast_xmit(struct net *net, struct sk_buff_head *pkts, struct tipc_mc_method *method, struct tipc_nlist *dests, u16 *cong_link_cnt); diff --git a/net/tipc/link.c b/net/tipc/link.c index a2e9a64d5a0f..5153b9bb7b3f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -550,7 +550,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, /* Disable replicast if even a single peer doesn't support it */ if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST)) - tipc_bcast_disable_rcast(net); + tipc_bcast_toggle_rcast(net, false); return true; } diff --git a/net/tipc/node.c b/net/tipc/node.c index b058647fa78b..b9f6b5dfdb5b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -496,6 +496,9 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, tn->capabilities &= temp_node->capabilities; } + tipc_bcast_toggle_rcast(net, + (tn->capabilities & TIPC_BCAST_RCAST)); + goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); @@ -557,6 +560,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } + tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); @@ -740,7 +744,8 @@ static bool tipc_node_cleanup(struct tipc_node *peer) list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } - + tipc_bcast_toggle_rcast(peer->net, + (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } @@ -2198,6 +2203,7 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; 
} + tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); -- 2.20.1 |
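For reference, a standalone model of what this patch changes: the cluster-wide capability set is the AND of all remaining peers' capabilities, recomputed whenever a node is created or cleaned up, and replicast is then switched on only if every peer still advertises TIPC_BCAST_RCAST. The struct and function names below are invented for illustration, and the numeric value used for TIPC_BCAST_RCAST is an assumption; only the AND-and-toggle pattern mirrors the patch, this is not the kernel implementation.

/* Model of the capability re-calculation hooked into tipc_node_create()/
 * tipc_node_cleanup()/tipc_nl_peer_rm() by this patch.
 */
#include <stdio.h>
#include <stdbool.h>

#define TIPC_BCAST_RCAST  (1 << 4)      /* illustrative bit value, not taken from the patch */

struct model_node { unsigned int capabilities; };

static bool recalc_rcast_support(const struct model_node *nodes, int cnt)
{
        unsigned int caps = ~0U;        /* start from "all capabilities" */
        int i;

        for (i = 0; i < cnt; i++)
                caps &= nodes[i].capabilities;  /* tn->capabilities &= temp_node->capabilities */

        return caps & TIPC_BCAST_RCAST;         /* tipc_bcast_toggle_rcast(net, ...) */
}

int main(void)
{
        struct model_node cluster[] = {
                { .capabilities = TIPC_BCAST_RCAST },
                { .capabilities = 0 },          /* legacy peer without replicast support */
        };

        printf("with legacy peer: rcast=%d\n", recalc_rcast_support(cluster, 2));
        printf("after it leaves:  rcast=%d\n", recalc_rcast_support(cluster, 1));
        return 0;
}

With the legacy peer present the replicast capability stays off; as soon as that peer is removed, the recalculation re-enables it, which is exactly the behaviour the commit message asks for.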
From: Tuong L. T. <tuo...@de...> - 2019-11-20 03:36:54
|
Hi Jon/Ying, We still have this patch (i.e. for the 'tipc node set/flush key...' commands), may I put your ACK on it before sending to iproute2-next? Many thanks! BR/Tuong -----Original Message----- From: Jon Maloy <jon...@er...> Sent: Wednesday, October 16, 2019 9:51 PM To: Ying Xue <yin...@wi...>; Tuong Tong Lien <tuo...@de...>; tip...@li...; ma...@do... Subject: RE: [iproute2] tipc: add new commands to set TIPC AEAD key > -----Original Message----- > From: Ying Xue <yin...@wi...> > Sent: 16-Oct-19 08:30 > To: Tuong Tong Lien <tuo...@de...>; tipc- > dis...@li...; Jon Maloy <jon...@er...>; > ma...@do... > Subject: Re: [iproute2] tipc: add new commands to set TIPC AEAD key > > Tt looks like we will use "tipc node" command to configure static key to TIPC > module, right? The key is static in the sense that TIPC itself cannot change the key. But the protocol ensures that keys can be replaced without any traffic disturbances. > > Do we plan to support dynamic key setting? If yes, what kinds of key exchange > protocol would we use? For example, in IPSEC, it uses IKEv2 as its key > exchange protocol. At the moment we assume there is an external user land framework where node authentication is done and where keys are generated and distributed (via TLS) to the nodes. When we want to replace a key (probably at fix pre-defined intervals), the framework has to generate new keys and distribute/inject those to TIPC. > > Will key be expired after a specific lifetime? For instance, in > IPSEC/Raccoon2 or strongswan, they use rekey feature to provide this > function to make security association safer. We are considering this, so that the external framework can be kept simpler or even be eliminated. That would be the next step, once this series is applied. Regards ///jon > > On 10/14/19 7:36 PM, Tuong Lien wrote: > > Two new commands are added as part of 'tipc node' command: > > > > $tipc node set key KEY [algname ALGNAME] [nodeid NODEID] $tipc node > > flush key > > > > which enable user to set and remove AEAD keys in kernel TIPC. > > > > For the 'set key' command, the given 'nodeid' parameter decides the > > mode to be applied to the key, particularly: > > > > - If NODEID is empty, the key is a 'cluster' key which will be used > > for all message encryption/decryption from/to the node (i.e. both TX & RX). > > The same key needs to be set in the other nodes i.e. the 'cluster key' > > mode. > > > > - If NODEID is own node, the key is used for message encryption (TX) > > from the node. Whereas, if NODEID is a peer node, the key is for > > message decryption (RX) from that peer node. > > This is the 'per-node-key' mode that each nodes in the cluster has its > > specific (TX) key. 
> > > > Signed-off-by: Tuong Lien <tuo...@de...> > > --- > > include/uapi/linux/tipc.h | 21 ++++++ > > include/uapi/linux/tipc_netlink.h | 4 ++ > > tipc/misc.c | 38 +++++++++++ > > tipc/misc.h | 1 + > > tipc/node.c | 133 > +++++++++++++++++++++++++++++++++++++- > > 5 files changed, 195 insertions(+), 2 deletions(-) > > > > diff --git a/include/uapi/linux/tipc.h b/include/uapi/linux/tipc.h > > index e16cb4e2..b118ce9b 100644 > > --- a/include/uapi/linux/tipc.h > > +++ b/include/uapi/linux/tipc.h > > @@ -232,6 +232,27 @@ struct tipc_sioc_nodeid_req { > > char node_id[TIPC_NODEID_LEN]; > > }; > > > > +/* > > + * TIPC Crypto, AEAD mode > > + */ > > +#define TIPC_AEAD_MAX_ALG_NAME (32) > > +#define TIPC_AEAD_MIN_KEYLEN (16 + 4) > > +#define TIPC_AEAD_MAX_KEYLEN (32 + 4) > > + > > +struct tipc_aead_key { > > + char alg_name[TIPC_AEAD_MAX_ALG_NAME]; > > + unsigned int keylen; /* in bytes */ > > + char key[]; > > +}; > > + > > +#define TIPC_AEAD_KEY_MAX_SIZE (sizeof(struct tipc_aead_key) + \ > > + TIPC_AEAD_MAX_KEYLEN) > > + > > +static inline int tipc_aead_key_size(struct tipc_aead_key *key) { > > + return sizeof(*key) + key->keylen; > > +} > > + > > /* The macros and functions below are deprecated: > > */ > > > > diff --git a/include/uapi/linux/tipc_netlink.h > > b/include/uapi/linux/tipc_netlink.h > > index efb958fd..6c2194ab 100644 > > --- a/include/uapi/linux/tipc_netlink.h > > +++ b/include/uapi/linux/tipc_netlink.h > > @@ -63,6 +63,8 @@ enum { > > TIPC_NL_PEER_REMOVE, > > TIPC_NL_BEARER_ADD, > > TIPC_NL_UDP_GET_REMOTEIP, > > + TIPC_NL_KEY_SET, > > + TIPC_NL_KEY_FLUSH, > > > > __TIPC_NL_CMD_MAX, > > TIPC_NL_CMD_MAX = __TIPC_NL_CMD_MAX - 1 @@ -160,6 +162,8 > @@ enum { > > TIPC_NLA_NODE_UNSPEC, > > TIPC_NLA_NODE_ADDR, /* u32 */ > > TIPC_NLA_NODE_UP, /* flag */ > > + TIPC_NLA_NODE_ID, /* data */ > > + TIPC_NLA_NODE_KEY, /* data */ > > > > __TIPC_NLA_NODE_MAX, > > TIPC_NLA_NODE_MAX = __TIPC_NLA_NODE_MAX - 1 diff --git > a/tipc/misc.c > > b/tipc/misc.c index e4b1cd0c..1daf3072 100644 > > --- a/tipc/misc.c > > +++ b/tipc/misc.c > > @@ -98,6 +98,44 @@ int str2nodeid(char *str, uint8_t *id) > > return 0; > > } > > > > +int str2key(char *str, struct tipc_aead_key *key) { > > + int len = strlen(str); > > + int ishex = 0; > > + int i; > > + > > + /* Check if the input is a hex string (i.e. 0x...) 
*/ > > + if (len > 2 && strncmp(str, "0x", 2) == 0) { > > + ishex = is_hex(str + 2, len - 2 - 1); > > + if (ishex) { > > + len -= 2; > > + str += 2; > > + } > > + } > > + > > + /* Obtain key: */ > > + if (!ishex) { > > + key->keylen = len; > > + memcpy(key->key, str, len); > > + } else { > > + /* Convert hex string to key */ > > + key->keylen = (len + 1) / 2; > > + for (i = 0; i < key->keylen; i++) { > > + if (i == 0 && len % 2 != 0) { > > + if (sscanf(str, "%1hhx", &key->key[0]) != 1) > > + return -1; > > + str += 1; > > + continue; > > + } > > + if (sscanf(str, "%2hhx", &key->key[i]) != 1) > > + return -1; > > + str += 2; > > + } > > + } > > + > > + return 0; > > +} > > + > > void nodeid2str(uint8_t *id, char *str) { > > int i; > > diff --git a/tipc/misc.h b/tipc/misc.h index ff2f31f1..59309f68 100644 > > --- a/tipc/misc.h > > +++ b/tipc/misc.h > > @@ -18,5 +18,6 @@ uint32_t str2addr(char *str); int str2nodeid(char > > *str, uint8_t *id); void nodeid2str(uint8_t *id, char *str); void > > hash2nodestr(uint32_t hash, char *str); > > +int str2key(char *str, struct tipc_aead_key *key); > > > > #endif > > diff --git a/tipc/node.c b/tipc/node.c index 2fec6753..fc81bd30 100644 > > --- a/tipc/node.c > > +++ b/tipc/node.c > > @@ -157,6 +157,111 @@ static int cmd_node_set_nodeid(struct nlmsghdr > *nlh, const struct cmd *cmd, > > return msg_doit(nlh, NULL, NULL); > > } > > > > +static void cmd_node_set_key_help(struct cmdl *cmdl) { > > + fprintf(stderr, > > + "Usage: %s node set key KEY [algname ALGNAME] [nodeid > NODEID]\n\n" > > + "PROPERTIES\n" > > + " KEY - Symmetric KEY & SALT as a normal or hex > string\n" > > + " that consists of two parts:\n" > > + " [KEY: 16, 24 or 32 octets][SALT: 4 octets]\n\n" > > + " algname ALGNAME - Default: \"gcm(aes)\"\n\n" > > + " nodeid NODEID - Own or peer node identity to which the > key will\n" > > + " be attached. 
If not present, the key is a cluster\n" > > + " key!\n\n" > > + "EXAMPLES\n" > > + " %s node set key this_is_a_key16_salt algname \"gcm(aes)\" > nodeid node1\n" > > + " %s node set key > 0x746869735F69735F615F6B657931365F73616C74 nodeid node2\n\n", > > + cmdl->argv[0], cmdl->argv[0], cmdl->argv[0]); } > > + > > +static int cmd_node_set_key(struct nlmsghdr *nlh, const struct cmd *cmd, > > + struct cmdl *cmdl, void *data) { > > + struct { > > + struct tipc_aead_key key; > > + char mem[TIPC_AEAD_MAX_KEYLEN + 1]; > > + } input = {}; > > + struct opt opts[] = { > > + { "algname", OPT_KEYVAL, NULL }, > > + { "nodeid", OPT_KEYVAL, NULL }, > > + { NULL } > > + }; > > + struct nlattr *nest; > > + struct opt *opt_algname, *opt_nodeid; > > + char buf[MNL_SOCKET_BUFFER_SIZE]; > > + uint8_t id[TIPC_NODEID_LEN] = {0,}; > > + int keysize; > > + char *str; > > + > > + if (help_flag) { > > + (cmd->help)(cmdl); > > + return -EINVAL; > > + } > > + > > + if (cmdl->optind >= cmdl->argc) { > > + fprintf(stderr, "error, missing key\n"); > > + return -EINVAL; > > + } > > + > > + /* Get user key */ > > + str = shift_cmdl(cmdl); > > + if (str2key(str, &input.key)) { > > + fprintf(stderr, "error, invalid key input\n"); > > + return -EINVAL; > > + } > > + > > + if (parse_opts(opts, cmdl) < 0) > > + return -EINVAL; > > + > > + /* Get algorithm name, default: "gcm(aes)" */ > > + opt_algname = get_opt(opts, "algname"); > > + if (!opt_algname) > > + strcpy(input.key.alg_name, "gcm(aes)"); > > + else > > + strcpy(input.key.alg_name, opt_algname->val); > > + > > + /* Get node identity */ > > + opt_nodeid = get_opt(opts, "nodeid"); > > + if (opt_nodeid && str2nodeid(opt_nodeid->val, id)) { > > + fprintf(stderr, "error, invalid node identity\n"); > > + return -EINVAL; > > + } > > + > > + /* Init & do the command */ > > + nlh = msg_init(buf, TIPC_NL_KEY_SET); > > + if (!nlh) { > > + fprintf(stderr, "error, message initialisation failed\n"); > > + return -1; > > + } > > + nest = mnl_attr_nest_start(nlh, TIPC_NLA_NODE); > > + keysize = tipc_aead_key_size(&input.key); > > + mnl_attr_put(nlh, TIPC_NLA_NODE_KEY, keysize, &input.key); > > + if (opt_nodeid) > > + mnl_attr_put(nlh, TIPC_NLA_NODE_ID, TIPC_NODEID_LEN, id); > > + mnl_attr_nest_end(nlh, nest); > > + return msg_doit(nlh, NULL, NULL); > > +} > > + > > +static int cmd_node_flush_key(struct nlmsghdr *nlh, const struct cmd > *cmd, > > + struct cmdl *cmdl, void *data) { > > + char buf[MNL_SOCKET_BUFFER_SIZE]; > > + > > + if (help_flag) { > > + (cmd->help)(cmdl); > > + return -EINVAL; > > + } > > + > > + /* Init & do the command */ > > + nlh = msg_init(buf, TIPC_NL_KEY_FLUSH); > > + if (!nlh) { > > + fprintf(stderr, "error, message initialisation failed\n"); > > + return -1; > > + } > > + return msg_doit(nlh, NULL, NULL); > > +} > > + > > static int nodeid_get_cb(const struct nlmsghdr *nlh, void *data) { > > struct nlattr *info[TIPC_NLA_MAX + 1] = {}; @@ -270,13 +375,34 @@ > > static int cmd_node_set_netid(struct nlmsghdr *nlh, const struct cmd *cmd, > > return msg_doit(nlh, NULL, NULL); > > } > > > > +static void cmd_node_flush_help(struct cmdl *cmdl) { > > + fprintf(stderr, > > + "Usage: %s node flush PROPERTY\n\n" > > + "PROPERTIES\n" > > + " key - Flush all symmetric-keys\n", > > + cmdl->argv[0]); > > +} > > + > > +static int cmd_node_flush(struct nlmsghdr *nlh, const struct cmd *cmd, > > + struct cmdl *cmdl, void *data) > > +{ > > + const struct cmd cmds[] = { > > + { "key", cmd_node_flush_key, NULL }, > > + { NULL } > > + }; > > + > > + return run_cmd(nlh, cmd, cmds, 
cmdl, NULL); } > > + > > static void cmd_node_set_help(struct cmdl *cmdl) { > > fprintf(stderr, > > "Usage: %s node set PROPERTY\n\n" > > "PROPERTIES\n" > > " identity NODEID - Set node identity\n" > > - " clusterid CLUSTERID - Set local cluster id\n", > > + " clusterid CLUSTERID - Set local cluster id\n" > > + " key PROPERTY - Set symmetric-key\n", > > cmdl->argv[0]); > > } > > > > @@ -288,6 +414,7 @@ static int cmd_node_set(struct nlmsghdr *nlh, > const struct cmd *cmd, > > { "identity", cmd_node_set_nodeid, NULL }, > > { "netid", cmd_node_set_netid, NULL }, > > { "clusterid", cmd_node_set_netid, NULL }, > > + { "key", cmd_node_set_key, cmd_node_set_key_help }, > > { NULL } > > }; > > > > @@ -325,7 +452,8 @@ void cmd_node_help(struct cmdl *cmdl) > > "COMMANDS\n" > > " list - List remote nodes\n" > > " get - Get local node parameters\n" > > - " set - Set local node parameters\n", > > + " set - Set local node parameters\n" > > + " flush - Flush local node parameters\n", > > cmdl->argv[0]); > > } > > > > @@ -336,6 +464,7 @@ int cmd_node(struct nlmsghdr *nlh, const struct > cmd *cmd, struct cmdl *cmdl, > > { "list", cmd_node_list, NULL }, > > { "get", cmd_node_get, cmd_node_get_help }, > > { "set", cmd_node_set, cmd_node_set_help }, > > + { "flush", cmd_node_flush, cmd_node_flush_help}, > > { NULL } > > }; > > > > |
From: Tuong L. <tuo...@de...> - 2019-11-20 02:15:41
|
It is observed that TIPC service binding order will not be kept in the publication event report to user if the service is subscribed after the bindings. For example, services are bound by application in the following order: Server: bound port A to {18888,66,66} scope 2 Server: bound port A to {18888,33,33} scope 2 Now, if a client subscribes to the service range (e.g. {18888, 0-100}), it will get the 'TIPC_PUBLISHED' events in that binding order only when the subscription is started before the bindings. Otherwise, if started after the bindings, the events will arrive in the opposite order: Client: received event for published {18888,33,33} Client: received event for published {18888,66,66} For the latter case, it is clear that the bindings have existed in the name table already, so when reported, the events' order will follow the order of the rbtree binding nodes (- a node with lesser 'lower'/'upper' range value will be first). This is correct as we provide the tracking on a specific service status (available or not), not the relationship between multiple services. However, some users expect to see the same order of arriving events irrespective of when the subscription is issued. This turns out to be easy to fix. We now add functionality to ensure that publication events always are issued in the same temporal order as the corresponding bindings were performed. Acked-by: Jon Maloy <jon...@er...> Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/name_table.c | 48 ++++++++++++++++++++++++++++++++++++++++-------- net/tipc/name_table.h | 4 ++++ 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 66a65c2cdb23..898a4ad5909d 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -35,6 +35,7 @@ */ #include <net/sock.h> +#include <linux/list_sort.h> #include "core.h" #include "netlink.h" #include "name_table.h" @@ -66,6 +67,7 @@ struct service_range { /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service + * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type @@ -74,6 +76,7 @@ struct service_range { */ struct tipc_service { u32 type; + unsigned int publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; @@ -109,6 +112,7 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, INIT_LIST_HEAD(&publ->binding_node); INIT_LIST_HEAD(&publ->local_publ); INIT_LIST_HEAD(&publ->all_publ); + INIT_LIST_HEAD(&publ->list); return publ; } @@ -244,6 +248,8 @@ static struct publication *tipc_service_insert_publ(struct net *net, p = tipc_publ_create(type, lower, upper, scope, node, port, key); if (!p) goto err; + /* Suppose there shouldn't be a huge gap btw publs i.e. 
>INT_MAX */ + p->id = sc->publ_cnt++; if (in_own_node(net, node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); @@ -277,6 +283,17 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr, return NULL; } +#define publication_after(pa, pb) (((int)((pb)->id - (pa)->id) < 0)) +static int tipc_publ_sort(void *priv, struct list_head *a, + struct list_head *b) +{ + struct publication *pa, *pb; + + pa = container_of(a, struct publication, list); + pb = container_of(b, struct publication, list); + return publication_after(pa, pb); +} + /** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service @@ -286,36 +303,51 @@ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct tipc_subscr *sb = &sub->evt.s; + struct publication *p, *first, *tmp; + struct list_head publ_list; struct service_range *sr; struct tipc_name_seq ns; - struct publication *p; struct rb_node *n; - bool first; + u32 filter; ns.type = tipc_sub_read(sb, seq.type); ns.lower = tipc_sub_read(sb, seq.lower); ns.upper = tipc_sub_read(sb, seq.upper); + filter = tipc_sub_read(sb, filter); tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); - if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS) + if (filter & TIPC_SUB_NO_STATUS) return; + INIT_LIST_HEAD(&publ_list); for (n = rb_first(&service->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower > ns.upper) break; if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper)) continue; - first = true; + first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { - tipc_sub_report_overlap(sub, sr->lower, sr->upper, - TIPC_PUBLISHED, p->port, - p->node, p->scope, first); - first = false; + if (filter & TIPC_SUB_PORTS) + list_add_tail(&p->list, &publ_list); + else if (!first || publication_after(first, p)) + /* Pick this range's *first* publication */ + first = p; } + if (first) + list_add_tail(&first->list, &publ_list); + } + + /* Sort the publications before reporting */ + list_sort(NULL, &publ_list, tipc_publ_sort); + list_for_each_entry_safe(p, tmp, &publ_list, list) { + tipc_sub_report_overlap(sub, p->lower, p->upper, + TIPC_PUBLISHED, p->port, p->node, + p->scope, true); + list_del_init(&p->list); } } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index f79066334cc8..3d5da71ce41e 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -58,6 +58,7 @@ struct tipc_group; * @node: network address of publishing socket's node * @port: publishing port * @key: publication key, unique across the cluster + * @id: publication id * @binding_node: all publications from the same node which bound this one * - Remote publications: in node->publ_list * Used by node/name distr to withdraw publications when node is lost @@ -69,6 +70,7 @@ struct tipc_group; * Used by closest_first and multicast receive lookup algorithms * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm + * @list: to form a list of publications in temporal order * @rcu: RCU callback head used for deferred freeing */ struct publication { @@ -79,10 +81,12 @@ struct publication { u32 node; u32 port; u32 key; + unsigned int id; struct list_head binding_node; struct list_head binding_sock; struct list_head local_publ; struct list_head all_publ; + struct list_head list; struct rcu_head rcu; }; -- 2.13.7 |
From: Xue, Y. <Yin...@wi...> - 2019-11-19 16:47:52
|
Hi Jon, I don't know whether you remember, but many years ago I implemented a prototype introducing slow-start and traffic congestion avoidance algorithms at the link layer :) That experiment produced a few meaningful findings: - How the initial congestion window and the upper congestion window size were set was crucial for throughput performance. - Different Ethernet speeds, different CPU capabilities and different message sizes all had a big impact on throughput performance. I did lots of experiments; sometimes the performance improvement was very obvious, but sometimes it was minimal, even when I measured throughput in a similar network environment. In particular, if I slightly changed some test conditions, the throughput results were also quite different. At that point I even doubted whether we needed to make the following changes: 1. Should we introduce a timer to measure RTT and identify whether network congestion happens or not? 2. Should we change the message delivery mode from message-oriented to byte-oriented? Of course, in my experiment I didn't make such big changes. So I want to know: - How did you select the minimum window size and the maximum window size? - Did you measure TIPC throughput performance on different Ethernets, including both large and small message size tests? - Did you see phenomena similar to mine when you slightly changed the test conditions? In this proposal, during the slow-start stage, the window increase is pretty slow: + if (qlen >= cwin && (l->snd_nxt - buf_seqno(skb_peek(txq)) == qlen)) { + add = l->cong_acks++ % 32 ? 0 : 1; + cwin = min_t(u16, cwin + add, l->max_win); + l->window = cwin; + } But in the TCP slow-start algorithm, the congestion window increase during the slow-start stage is much more aggressive than the above. Only when the congestion window exceeds the slow-start threshold does TCP enter the congestion avoidance stage, in which the congestion window increases slowly. I am curious why the congestion window increase here is so conservative compared to the TCP slow-start algorithm. What factors did you consider when you selected the algorithm? Thanks, Ying -----Original Message----- From: Jon Maloy [mailto:jon...@er...] Sent: Tuesday, November 19, 2019 7:33 AM To: Jon Maloy; Jon Maloy Cc: moh...@er...; par...@gm...; tun...@de...; hoa...@de...; tuo...@de...; gor...@de...; Xue, Ying; tip...@li... Subject: [net-next v3 1/1] tipc: introduce variable window congestion control We introduce a simple variable window congestion control for links. The algorithm is inspired by the Reno algorithm, and can best be described as working in permanent "congestion avoidance" mode, within strict limits. - We introduce hard lower and upper window limits per link, still different and configurable per bearer type. - Next, we let a link start at the minimum window, and then slowly increment it for each 32 received non-duplicate ACK. This goes on until it either reaches the upper limit, or until it receives a NACK message. - For each non-duplicate NACK received, we let the window decrease by intervals of 1/2 of the current window, but not below the minimum window. The change does in reality have effect only on unicast ethernet transport, as we have seen that there is no room whatsoever for increasing the window max size for the UDP bearer. For now, we also choose to keep the limits for the broadcast link unchanged and equal. 
This algorithm seems to give a ~25% throughput improvement for large messages, while it has no effect on throughput for small messages. Suggested-by: Xin Long <luc...@gm...> Acked-by: Xin Long <luc...@gm...> Signed-off-by: Jon Maloy <jon...@er...> --- v2: - Moved window increment in tipc_advance_backlogq() to before the transfer loop, as suggested Tuong. - Introduced logic for incrementing the window even for the broadcast send link, also suggested by Tuong. v3: - Rebased to latest net-next --- net/tipc/bcast.c | 11 ++++---- net/tipc/bearer.c | 11 ++++---- net/tipc/bearer.h | 6 +++-- net/tipc/eth_media.c | 6 ++++- net/tipc/ib_media.c | 5 +++- net/tipc/link.c | 76 ++++++++++++++++++++++++++++++++++------------------ net/tipc/link.h | 9 ++++--- net/tipc/node.c | 13 +++++---- net/tipc/udp_media.c | 3 ++- 9 files changed, 90 insertions(+), 50 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index f41096a..84da317 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -562,18 +562,18 @@ int tipc_bclink_reset_stats(struct net *net) return 0; } -static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit) +static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) { struct tipc_link *l = tipc_bc_sndlink(net); if (!l) return -ENOPROTOOPT; - if (limit < BCLINK_WIN_MIN) - limit = BCLINK_WIN_MIN; - if (limit > TIPC_MAX_LINK_WIN) + if (max_win < BCLINK_WIN_MIN) + max_win = BCLINK_WIN_MIN; + if (max_win > TIPC_MAX_LINK_WIN) return -EINVAL; tipc_bcast_lock(net); - tipc_link_set_queue_limits(l, limit); + tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win); tipc_bcast_unlock(net); return 0; } @@ -683,6 +683,7 @@ int tipc_bcast_init(struct net *net) if (!tipc_link_bc_create(net, 0, 0, FB_MTU, BCLINK_WIN_DEFAULT, + BCLINK_WIN_DEFAULT, 0, &bb->inputq, NULL, diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index d7ec26b..34ca7b7 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -311,7 +311,8 @@ static int tipc_enable_bearer(struct net *net, const char *name, b->identity = bearer_id; b->tolerance = m->tolerance; - b->window = m->window; + b->min_win = m->min_win; + b->max_win = m->max_win; b->domain = disc_domain; b->net_plane = bearer_id + 'A'; b->priority = prio; @@ -796,7 +797,7 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->max_win)) goto prop_msg_full; if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu)) @@ -1088,7 +1089,7 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + b->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; @@ -1142,7 +1143,7 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->max_win)) goto prop_msg_full; if (media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) @@ -1275,7 +1276,7 @@ int 
__tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + m->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (m->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index d0c79cc..bc00231 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -119,7 +119,8 @@ struct tipc_media { char *raw); u32 priority; u32 tolerance; - u32 window; + u32 min_win; + u32 max_win; u32 mtu; u32 type_id; u32 hwaddr_len; @@ -158,7 +159,8 @@ struct tipc_bearer { struct packet_type pt; struct rcu_head rcu; u32 priority; - u32 window; + u32 min_win; + u32 max_win; u32 tolerance; u32 domain; u32 identity; diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index f69a2fd..38cdcab 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -37,6 +37,9 @@ #include "core.h" #include "bearer.h" +#define TIPC_MIN_ETH_LINK_WIN 50 +#define TIPC_MAX_ETH_LINK_WIN 500 + /* Convert Ethernet address (media address format) to string */ static int tipc_eth_addr2str(struct tipc_media_addr *addr, char *strbuf, int bufsz) @@ -92,7 +95,8 @@ struct tipc_media eth_media_info = { .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_MIN_ETH_LINK_WIN, + .max_win = TIPC_MAX_ETH_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index e8c1671..7aa9ff8 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -42,6 +42,8 @@ #include "core.h" #include "bearer.h" +#define TIPC_MAX_IB_LINK_WIN 500 + /* convert InfiniBand address (media address format) media address to string */ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) @@ -94,7 +96,8 @@ struct tipc_media ib_media_info = { .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_MAX_IB_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_IB, .hwaddr_len = INFINIBAND_ALEN, .name = "ib" diff --git a/net/tipc/link.c b/net/tipc/link.c index fb72031..a88950b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -164,7 +164,6 @@ struct tipc_link { struct sk_buff *target_bskb; } backlog[5]; u16 snd_nxt; - u16 window; /* Reception */ u16 rcv_nxt; @@ -175,6 +174,10 @@ struct tipc_link { /* Congestion handling */ struct sk_buff_head wakeupq; + u16 window; + u16 min_win; + u16 max_win; + u16 cong_acks; /* Fragmentation/reassembly */ struct sk_buff *reasm_buf; @@ -308,9 +311,14 @@ u32 tipc_link_id(struct tipc_link *l) return l->peer_bearer_id << 16 | l->bearer_id; } -int tipc_link_window(struct tipc_link *l) +int tipc_link_min_win(struct tipc_link *l) +{ + return l->min_win; +} + +int tipc_link_max_win(struct tipc_link *l) { - return l->window; + return l->max_win; } int tipc_link_prio(struct tipc_link *l) @@ -436,7 +444,8 @@ u32 tipc_link_state(struct tipc_link *l) * @net_plane: network plane (A,B,c..) 
this link belongs to * @mtu: mtu to be advertised by link * @priority: priority to be used by link - * @window: send window to be used by link + * @min_win: minimal send window to be used by link + * @max_win: maximal send window to be used by link * @session: session to be used by link * @ownnode: identity of own node * @peer: node id of peer node @@ -451,7 +460,7 @@ u32 tipc_link_state(struct tipc_link *l) */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 self, + u32 min_win, u32 max_win, u32 session, u32 self, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -495,7 +504,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->advertised_mtu = mtu; l->mtu = mtu; l->priority = priority; - tipc_link_set_queue_limits(l, window); + tipc_link_set_queue_limits(l, min_win, max_win); l->ackers = 1; l->bc_sndlink = bc_sndlink; l->bc_rcvlink = bc_rcvlink; @@ -523,7 +532,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * Returns true if link was created, otherwise false */ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -531,9 +540,9 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, { struct tipc_link *l; - if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window, - 0, ownnode, peer, NULL, peer_caps, bc_sndlink, - NULL, inputq, namedq, link)) + if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, min_win, + max_win, 0, ownnode, peer, NULL, peer_caps, + bc_sndlink, NULL, inputq, namedq, link)) return false; l = *link; @@ -959,7 +968,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, int pkt_cnt = skb_queue_len(list); int imp = msg_importance(hdr); unsigned int mss = tipc_link_mss(l); - unsigned int maxwin = l->window; + unsigned int cwin = l->window; unsigned int mtu = l->mtu; bool new_bundle; int rc = 0; @@ -988,7 +997,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, /* Prepare each packet for sending, and add to relevant queue: */ while ((skb = __skb_dequeue(list))) { - if (likely(skb_queue_len(transmq) < maxwin)) { + if (likely(skb_queue_len(transmq) < cwin)) { hdr = buf_msg(skb); msg_set_seqno(hdr, seqno); msg_set_ack(hdr, ack); @@ -1038,6 +1047,8 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, static void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) { + struct sk_buff_head *txq = &l->transmq; + u16 qlen, add, cwin = l->window; struct sk_buff *skb, *_skb; struct tipc_msg *hdr; u16 seqno = l->snd_nxt; @@ -1045,7 +1056,13 @@ static void tipc_link_advance_backlog(struct tipc_link *l, u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u32 imp; - while (skb_queue_len(&l->transmq) < l->window) { + qlen = skb_queue_len(txq); + if (qlen >= cwin && (l->snd_nxt - buf_seqno(skb_peek(txq)) == qlen)) { + add = l->cong_acks++ % 32 ? 
0 : 1; + cwin = min_t(u16, cwin + add, l->max_win); + l->window = cwin; + } + while (skb_queue_len(txq) < cwin) { skb = skb_peek(&l->backlogq); if (!skb) break; @@ -1140,6 +1157,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, { struct sk_buff *_skb, *skb = skb_peek(&l->transmq); u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + bool retransmitted = false; u16 ack = l->rcv_nxt - 1; struct tipc_msg *hdr; int rc = 0; @@ -1173,11 +1191,13 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; } + if (retransmitted) + l->window = l->min_win + (l->window - l->min_win) / 2; return 0; } @@ -1417,6 +1437,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + bool retransmitted = false; u16 ack = l->rcv_nxt - 1; bool passed = false; u16 seqno, n = 0; @@ -1449,7 +1470,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; @@ -1463,7 +1484,8 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, goto next_gap_ack; } } - + if (retransmitted) + l->window = l->min_win + (l->window - l->min_win) / 2; return 0; } @@ -2305,15 +2327,17 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; } -void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) { int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE); - l->window = win; - l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); - l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2); - l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3); - l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4); + l->window = min_win; + l->min_win = min_win; + l->max_win = max_win; + l->backlog[TIPC_LOW_IMPORTANCE].limit = min_win * 2; + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = min_win * 4; + l->backlog[TIPC_HIGH_IMPORTANCE].limit = min_win * 6; + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = min_win * 8; l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } @@ -2366,10 +2390,10 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - if ((win < TIPC_MIN_LINK_WIN) || (win > TIPC_MAX_LINK_WIN)) + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (max_win < TIPC_MIN_LINK_WIN || max_win > TIPC_MAX_LINK_WIN) return -EINVAL; } @@ -2605,7 +2629,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->max_win)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode)) goto prop_msg_full; diff --git a/net/tipc/link.h b/net/tipc/link.h index 
c09e9d4..d3c1c3f 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -73,7 +73,7 @@ enum { bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 ownnode, + u32 min_win, u32 max_win, u32 session, u32 ownnode, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -81,7 +81,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *namedq, struct tipc_link **link); bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -115,7 +115,8 @@ char *tipc_link_name_ext(struct tipc_link *l, char *buf); u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); -int tipc_link_window(struct tipc_link *l); +int tipc_link_min_win(struct tipc_link *l); +int tipc_link_max_win(struct tipc_link *l); void tipc_link_update_caps(struct tipc_link *l, u16 capabilities); bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr); unsigned long tipc_link_tolerance(struct tipc_link *l); @@ -124,7 +125,7 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, void tipc_link_set_prio(struct tipc_link *l, u32 prio, struct sk_buff_head *xmitq); void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); -void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win); int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *link, int nlflags); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); diff --git a/net/tipc/node.c b/net/tipc/node.c index aaf5956..970c780 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1134,7 +1134,8 @@ void tipc_node_check_dest(struct net *net, u32 addr, snd_l = tipc_bc_sndlink(net); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, U16_MAX, - tipc_link_window(snd_l), + tipc_link_min_win(snd_l), + tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, @@ -1228,7 +1229,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, - b->window, session, + b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, @@ -2374,10 +2375,12 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - tipc_link_set_queue_limits(link, win); + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + tipc_link_set_queue_limits(link, + tipc_link_min_win(link), + max_win); } } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 86aaa4d..7d9dcbb 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -825,7 +825,8 @@ struct tipc_media udp_media_info = { .msg2addr = tipc_udp_msg2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_DEF_LINK_WIN, .mtu = TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, 
.hwaddr_len = 0, -- 2.1.4 |
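To make the pace Ying asks about concrete, here is a toy simulation of the window evolution implied by the patch: +1 for every 32 calls that find the transmit queue filling the current window (the cong_acks counter), and a halving of the distance above min_win on each retransmit/NACK, clamped to [min_win, max_win]. MIN_WIN/MAX_WIN follow the Ethernet limits defined in the patch (50/500); the link struct, the function names and the event pattern are invented for illustration and are not the kernel code.

/* Toy simulation of the variable-window behaviour described above. */
#include <stdio.h>

#define MIN_WIN 50
#define MAX_WIN 500

struct toy_link {
        unsigned short window;
        unsigned short cong_acks;
};

/* One call models tipc_link_advance_backlog() finding the transmit queue
 * full: the window grows by 1 every 32nd such event, as in the patch.
 */
static void on_window_limited(struct toy_link *l)
{
        unsigned short add = l->cong_acks++ % 32 ? 0 : 1;

        l->window = l->window + add;
        if (l->window > MAX_WIN)
                l->window = MAX_WIN;
}

/* One call models a retransmit/NACK: back off halfway toward the minimum. */
static void on_retransmit(struct toy_link *l)
{
        l->window = MIN_WIN + (l->window - MIN_WIN) / 2;
}

int main(void)
{
        struct toy_link l = { .window = MIN_WIN };
        int i;

        for (i = 0; i < 32 * 500; i++)  /* loss-free period: creep up to the cap */
                on_window_limited(&l);
        printf("after loss-free period: window=%u\n", l.window);

        on_retransmit(&l);
        printf("after one NACK:         window=%u\n", l.window);
        on_retransmit(&l);
        printf("after a second NACK:    window=%u\n", l.window);
        return 0;
}

With these numbers the window creeps from 50 up to the 500 cap only after 500 * 32 loss-free, window-limited transmit opportunities, then drops to 275 and 162 after two consecutive NACKs, which illustrates how much slower this additive increase is than TCP's exponential slow-start phase.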
From: Tuong L. T. <tuo...@de...> - 2019-11-19 01:57:22
|
Hi Jon, @@ -1045,7 +1056,13 @@ static void tipc_link_advance_backlog(struct tipc_link *l, u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u32 imp; - while (skb_queue_len(&l->transmq) < l->window) { + qlen = skb_queue_len(txq); + if (qlen >= cwin && (l->snd_nxt - buf_seqno(skb_peek(txq)) == qlen)) { [Tuong]: now it should be: 'qlen + backlogq_len >= cwin' instead since we might have released some packets from the transmq just before calling this function...? + add = l->cong_acks++ % 32 ? 0 : 1; + cwin = min_t(u16, cwin + add, l->max_win); + l->window = cwin; + } + while (skb_queue_len(txq) < cwin) { BR/Tuong -----Original Message----- From: Jon Maloy <jon...@er...> Sent: Tuesday, November 19, 2019 6:33 AM To: Jon Maloy <jon...@er...>; Jon Maloy <ma...@do...> Cc: moh...@er...; par...@gm...; tun...@de...; hoa...@de...; tuo...@de...; gor...@de...; yin...@wi...; tip...@li... Subject: [net-next v3 1/1] tipc: introduce variable window congestion control We introduce a simple variable window congestion control for links. The algorithm is inspired by the Reno algorithm, and can best be descibed as working in permanent "congestion avoidance" mode, within strict limits. - We introduce hard lower and upper window limits per link, still different and configurable per bearer type. - Next, we let a link start at the minimum window, and then slowly increment it for each 32 received non-duplicate ACK. This goes on until it either reaches the upper limit, or until it receives a NACK message. - For each non-duplicate NACK received, we let the window decrease by intervals of 1/2 of the current window, but not below the minimum window. The change does in reality have effect only on unicast ethernet transport, as we have seen that there is no room whatsoever for increasing the window max size for the UDP bearer. For now, we also choose to keep the limits for the broadcast link unchanged and equal. This algorithm seems to give a ~25% throughput improvement for large messages, while it has no effect on throughput for small messages. Suggested-by: Xin Long <luc...@gm...> Acked-by: Xin Long <luc...@gm...> Signed-off-by: Jon Maloy <jon...@er...> --- v2: - Moved window increment in tipc_advance_backlogq() to before the transfer loop, as suggested Tuong. - Introduced logic for incrementing the window even for the broadcast send link, also suggested by Tuong. 
v3: - Rebased to latest net-next --- net/tipc/bcast.c | 11 ++++---- net/tipc/bearer.c | 11 ++++---- net/tipc/bearer.h | 6 +++-- net/tipc/eth_media.c | 6 ++++- net/tipc/ib_media.c | 5 +++- net/tipc/link.c | 76 ++++++++++++++++++++++++++++++++++------------------ net/tipc/link.h | 9 ++++--- net/tipc/node.c | 13 +++++---- net/tipc/udp_media.c | 3 ++- 9 files changed, 90 insertions(+), 50 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index f41096a..84da317 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -562,18 +562,18 @@ int tipc_bclink_reset_stats(struct net *net) return 0; } -static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit) +static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) { struct tipc_link *l = tipc_bc_sndlink(net); if (!l) return -ENOPROTOOPT; - if (limit < BCLINK_WIN_MIN) - limit = BCLINK_WIN_MIN; - if (limit > TIPC_MAX_LINK_WIN) + if (max_win < BCLINK_WIN_MIN) + max_win = BCLINK_WIN_MIN; + if (max_win > TIPC_MAX_LINK_WIN) return -EINVAL; tipc_bcast_lock(net); - tipc_link_set_queue_limits(l, limit); + tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win); tipc_bcast_unlock(net); return 0; } @@ -683,6 +683,7 @@ int tipc_bcast_init(struct net *net) if (!tipc_link_bc_create(net, 0, 0, FB_MTU, BCLINK_WIN_DEFAULT, + BCLINK_WIN_DEFAULT, 0, &bb->inputq, NULL, diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index d7ec26b..34ca7b7 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -311,7 +311,8 @@ static int tipc_enable_bearer(struct net *net, const char *name, b->identity = bearer_id; b->tolerance = m->tolerance; - b->window = m->window; + b->min_win = m->min_win; + b->max_win = m->max_win; b->domain = disc_domain; b->net_plane = bearer_id + 'A'; b->priority = prio; @@ -796,7 +797,7 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->max_win)) goto prop_msg_full; if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu)) @@ -1088,7 +1089,7 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + b->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; @@ -1142,7 +1143,7 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->max_win)) goto prop_msg_full; if (media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) @@ -1275,7 +1276,7 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + m->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (m->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index d0c79cc..bc00231 100644 --- a/net/tipc/bearer.h +++ 
b/net/tipc/bearer.h @@ -119,7 +119,8 @@ struct tipc_media { char *raw); u32 priority; u32 tolerance; - u32 window; + u32 min_win; + u32 max_win; u32 mtu; u32 type_id; u32 hwaddr_len; @@ -158,7 +159,8 @@ struct tipc_bearer { struct packet_type pt; struct rcu_head rcu; u32 priority; - u32 window; + u32 min_win; + u32 max_win; u32 tolerance; u32 domain; u32 identity; diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index f69a2fd..38cdcab 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -37,6 +37,9 @@ #include "core.h" #include "bearer.h" +#define TIPC_MIN_ETH_LINK_WIN 50 +#define TIPC_MAX_ETH_LINK_WIN 500 + /* Convert Ethernet address (media address format) to string */ static int tipc_eth_addr2str(struct tipc_media_addr *addr, char *strbuf, int bufsz) @@ -92,7 +95,8 @@ struct tipc_media eth_media_info = { .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_MIN_ETH_LINK_WIN, + .max_win = TIPC_MAX_ETH_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index e8c1671..7aa9ff8 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -42,6 +42,8 @@ #include "core.h" #include "bearer.h" +#define TIPC_MAX_IB_LINK_WIN 500 + /* convert InfiniBand address (media address format) media address to string */ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) @@ -94,7 +96,8 @@ struct tipc_media ib_media_info = { .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_MAX_IB_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_IB, .hwaddr_len = INFINIBAND_ALEN, .name = "ib" diff --git a/net/tipc/link.c b/net/tipc/link.c index fb72031..a88950b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -164,7 +164,6 @@ struct tipc_link { struct sk_buff *target_bskb; } backlog[5]; u16 snd_nxt; - u16 window; /* Reception */ u16 rcv_nxt; @@ -175,6 +174,10 @@ struct tipc_link { /* Congestion handling */ struct sk_buff_head wakeupq; + u16 window; + u16 min_win; + u16 max_win; + u16 cong_acks; /* Fragmentation/reassembly */ struct sk_buff *reasm_buf; @@ -308,9 +311,14 @@ u32 tipc_link_id(struct tipc_link *l) return l->peer_bearer_id << 16 | l->bearer_id; } -int tipc_link_window(struct tipc_link *l) +int tipc_link_min_win(struct tipc_link *l) +{ + return l->min_win; +} + +int tipc_link_max_win(struct tipc_link *l) { - return l->window; + return l->max_win; } int tipc_link_prio(struct tipc_link *l) @@ -436,7 +444,8 @@ u32 tipc_link_state(struct tipc_link *l) * @net_plane: network plane (A,B,c..) 
this link belongs to * @mtu: mtu to be advertised by link * @priority: priority to be used by link - * @window: send window to be used by link + * @min_win: minimal send window to be used by link + * @max_win: maximal send window to be used by link * @session: session to be used by link * @ownnode: identity of own node * @peer: node id of peer node @@ -451,7 +460,7 @@ u32 tipc_link_state(struct tipc_link *l) */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 self, + u32 min_win, u32 max_win, u32 session, u32 self, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -495,7 +504,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->advertised_mtu = mtu; l->mtu = mtu; l->priority = priority; - tipc_link_set_queue_limits(l, window); + tipc_link_set_queue_limits(l, min_win, max_win); l->ackers = 1; l->bc_sndlink = bc_sndlink; l->bc_rcvlink = bc_rcvlink; @@ -523,7 +532,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * Returns true if link was created, otherwise false */ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -531,9 +540,9 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, { struct tipc_link *l; - if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window, - 0, ownnode, peer, NULL, peer_caps, bc_sndlink, - NULL, inputq, namedq, link)) + if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, min_win, + max_win, 0, ownnode, peer, NULL, peer_caps, + bc_sndlink, NULL, inputq, namedq, link)) return false; l = *link; @@ -959,7 +968,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, int pkt_cnt = skb_queue_len(list); int imp = msg_importance(hdr); unsigned int mss = tipc_link_mss(l); - unsigned int maxwin = l->window; + unsigned int cwin = l->window; unsigned int mtu = l->mtu; bool new_bundle; int rc = 0; @@ -988,7 +997,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, /* Prepare each packet for sending, and add to relevant queue: */ while ((skb = __skb_dequeue(list))) { - if (likely(skb_queue_len(transmq) < maxwin)) { + if (likely(skb_queue_len(transmq) < cwin)) { hdr = buf_msg(skb); msg_set_seqno(hdr, seqno); msg_set_ack(hdr, ack); @@ -1038,6 +1047,8 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, static void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) { + struct sk_buff_head *txq = &l->transmq; + u16 qlen, add, cwin = l->window; struct sk_buff *skb, *_skb; struct tipc_msg *hdr; u16 seqno = l->snd_nxt; @@ -1045,7 +1056,13 @@ static void tipc_link_advance_backlog(struct tipc_link *l, u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u32 imp; - while (skb_queue_len(&l->transmq) < l->window) { + qlen = skb_queue_len(txq); + if (qlen >= cwin && (l->snd_nxt - buf_seqno(skb_peek(txq)) == qlen)) { + add = l->cong_acks++ % 32 ? 
0 : 1; + cwin = min_t(u16, cwin + add, l->max_win); + l->window = cwin; + } + while (skb_queue_len(txq) < cwin) { skb = skb_peek(&l->backlogq); if (!skb) break; @@ -1140,6 +1157,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, { struct sk_buff *_skb, *skb = skb_peek(&l->transmq); u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + bool retransmitted = false; u16 ack = l->rcv_nxt - 1; struct tipc_msg *hdr; int rc = 0; @@ -1173,11 +1191,13 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; } + if (retransmitted) + l->window = l->min_win + (l->window - l->min_win) / 2; return 0; } @@ -1417,6 +1437,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + bool retransmitted = false; u16 ack = l->rcv_nxt - 1; bool passed = false; u16 seqno, n = 0; @@ -1449,7 +1470,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; @@ -1463,7 +1484,8 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, goto next_gap_ack; } } - + if (retransmitted) + l->window = l->min_win + (l->window - l->min_win) / 2; return 0; } @@ -2305,15 +2327,17 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; } -void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) { int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE); - l->window = win; - l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); - l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2); - l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3); - l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4); + l->window = min_win; + l->min_win = min_win; + l->max_win = max_win; + l->backlog[TIPC_LOW_IMPORTANCE].limit = min_win * 2; + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = min_win * 4; + l->backlog[TIPC_HIGH_IMPORTANCE].limit = min_win * 6; + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = min_win * 8; l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } @@ -2366,10 +2390,10 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - if ((win < TIPC_MIN_LINK_WIN) || (win > TIPC_MAX_LINK_WIN)) + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (max_win < TIPC_MIN_LINK_WIN || max_win > TIPC_MAX_LINK_WIN) return -EINVAL; } @@ -2605,7 +2629,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->max_win)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode)) goto prop_msg_full; diff --git a/net/tipc/link.h b/net/tipc/link.h index 
c09e9d4..d3c1c3f 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -73,7 +73,7 @@ enum { bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 ownnode, + u32 min_win, u32 max_win, u32 session, u32 ownnode, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -81,7 +81,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *namedq, struct tipc_link **link); bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -115,7 +115,8 @@ char *tipc_link_name_ext(struct tipc_link *l, char *buf); u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); -int tipc_link_window(struct tipc_link *l); +int tipc_link_min_win(struct tipc_link *l); +int tipc_link_max_win(struct tipc_link *l); void tipc_link_update_caps(struct tipc_link *l, u16 capabilities); bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr); unsigned long tipc_link_tolerance(struct tipc_link *l); @@ -124,7 +125,7 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, void tipc_link_set_prio(struct tipc_link *l, u32 prio, struct sk_buff_head *xmitq); void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); -void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win); int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *link, int nlflags); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); diff --git a/net/tipc/node.c b/net/tipc/node.c index aaf5956..970c780 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1134,7 +1134,8 @@ void tipc_node_check_dest(struct net *net, u32 addr, snd_l = tipc_bc_sndlink(net); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, U16_MAX, - tipc_link_window(snd_l), + tipc_link_min_win(snd_l), + tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, @@ -1228,7 +1229,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, - b->window, session, + b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, @@ -2374,10 +2375,12 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - tipc_link_set_queue_limits(link, win); + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + tipc_link_set_queue_limits(link, + tipc_link_min_win(link), + max_win); } } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 86aaa4d..7d9dcbb 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -825,7 +825,8 @@ struct tipc_media udp_media_info = { .msg2addr = tipc_udp_msg2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_DEF_LINK_WIN, .mtu = TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, 
.hwaddr_len = 0, -- 2.1.4 |
From: Jon M. <jon...@er...> - 2019-11-18 22:29:08
Ying, Tuong and others, I would like to get this in before net-next 5.4 closes, and we are at rc7 now. Please review and ack if you are ok with this. There is probably some room for improvements here, but we can do that later. This is still a good step forward. BR ///jon -----Original Message----- From: Jon Maloy <jon...@er...> Sent: 8-Nov-19 12:36 To: Jon Maloy <jon...@er...>; Jon Maloy <ma...@do...> Cc: Mohan Krishna Ghanta Krishnamurthy <moh...@er...>; par...@gm...; Tung Quang Nguyen <tun...@de...>; Hoang Huu Le <hoa...@de...>; Tuong Tong Lien <tuo...@de...>; Gordan Mihaljevic <gor...@de...>; yin...@wi...; tip...@li... Subject: [net-next v2 1/1] tipc: introduce variable window congestion control We introduce a simple variable window congestion control for links. The algorithm is inspired by the Reno algorithm, and can best be described as working in permanent "congestion avoidance" mode, within strict limits. - We introduce hard lower and upper window limits per link, still different and configurable per bearer type. - Next, we let a link start at the minimum window, and then slowly increment it for every 32 received non-duplicate ACKs. This goes on until it either reaches the upper limit, or until it receives a NACK message. - For each non-duplicate NACK received, we let the window decrease in steps of 1/2 of the current window, but not below the minimum window. The change does in reality have an effect only on unicast Ethernet transport, as we have seen that there is no room whatsoever for increasing the window max size for the UDP bearer. For now, we also choose to keep the limits for the broadcast link unchanged and equal. This algorithm seems to give a ~25% throughput improvement for large messages, while it has no effect on small message throughput. Suggested-by: Xin Long <luc...@gm...> Signed-off-by: Jon Maloy <jon...@er...> --- v2: - Moved window increment in tipc_advance_backlogq() to before the transfer loop, as suggested by Tuong. - Introduced logic for incrementing the window even for the broadcast send link, also suggested by Tuong.
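To make the window rules above easier to follow, here is a small stand-alone C sketch of the two adjustments. The struct and function names are invented for illustration; only the arithmetic mirrors the hunks in the diff below.

#include <stdint.h>

struct sketch_link {
        uint16_t window;     /* current send window (cwin)            */
        uint16_t min_win;    /* hard lower limit, set per bearer type */
        uint16_t max_win;    /* hard upper limit, set per bearer type */
        uint16_t cong_acks;  /* acks counted while the queue was full */
};

/* Additive increase: each time the backlog is advanced while the transmit
 * queue is still full of in-flight packets, count the event; every 32nd
 * such event grows the window by one, but never beyond max_win.
 */
static void sketch_window_increase(struct sketch_link *l)
{
        uint16_t add = (l->cong_acks++ % 32) ? 0 : 1;

        l->window += add;
        if (l->window > l->max_win)
                l->window = l->max_win;
}

/* Decrease on retransmission: halve the distance between the current
 * window and min_win, so the window decays toward, but never below,
 * the configured minimum.
 */
static void sketch_window_decrease(struct sketch_link *l)
{
        l->window = l->min_win + (l->window - l->min_win) / 2;
}

With the new Ethernet defaults from the patch (min_win 50, max_win 500), a loss-free link needs roughly 32 * 450 congested backlog advances to reach the ceiling, while a single retransmission pulls the window halfway back toward 50.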
--- net/tipc/bcast.c | 11 ++++---- net/tipc/bearer.c | 11 ++++---- net/tipc/bearer.h | 6 +++-- net/tipc/eth_media.c | 6 ++++- net/tipc/ib_media.c | 5 +++- net/tipc/link.c | 76 ++++++++++++++++++++++++++++++++++------------------ net/tipc/link.h | 9 ++++--- net/tipc/node.c | 13 +++++---- net/tipc/udp_media.c | 3 ++- 9 files changed, 90 insertions(+), 50 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 6ef1abd..12fde9a 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -562,18 +562,18 @@ int tipc_bclink_reset_stats(struct net *net) return 0; } -static int tipc_bc_link_set_queue_limits(struct net *net, u32 limit) +static int tipc_bc_link_set_queue_limits(struct net *net, u32 max_win) { struct tipc_link *l = tipc_bc_sndlink(net); if (!l) return -ENOPROTOOPT; - if (limit < BCLINK_WIN_MIN) - limit = BCLINK_WIN_MIN; - if (limit > TIPC_MAX_LINK_WIN) + if (max_win < BCLINK_WIN_MIN) + max_win = BCLINK_WIN_MIN; + if (max_win > TIPC_MAX_LINK_WIN) return -EINVAL; tipc_bcast_lock(net); - tipc_link_set_queue_limits(l, limit); + tipc_link_set_queue_limits(l, BCLINK_WIN_MIN, max_win); tipc_bcast_unlock(net); return 0; } @@ -683,6 +683,7 @@ int tipc_bcast_init(struct net *net) if (!tipc_link_bc_create(net, 0, 0, FB_MTU, BCLINK_WIN_DEFAULT, + BCLINK_WIN_DEFAULT, 0, &bb->inputq, NULL, diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 0214aa1..f994961 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -310,7 +310,8 @@ static int tipc_enable_bearer(struct net *net, const char *name, b->identity = bearer_id; b->tolerance = m->tolerance; - b->window = m->window; + b->min_win = m->min_win; + b->max_win = m->max_win; b->domain = disc_domain; b->net_plane = bearer_id + 'A'; b->priority = prio; @@ -765,7 +766,7 @@ static int __tipc_nl_add_bearer(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, bearer->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bearer->max_win)) goto prop_msg_full; if (bearer->media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, bearer->mtu)) @@ -1057,7 +1058,7 @@ int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) b->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - b->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + b->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; @@ -1111,7 +1112,7 @@ static int __tipc_nl_add_media(struct tipc_nl_msg *msg, goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, media->tolerance)) goto prop_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, media->max_win)) goto prop_msg_full; if (media->type_id == TIPC_MEDIA_TYPE_UDP) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_MTU, media->mtu)) @@ -1244,7 +1245,7 @@ int __tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) if (props[TIPC_NLA_PROP_PRIO]) m->priority = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (props[TIPC_NLA_PROP_WIN]) - m->window = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + m->max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (props[TIPC_NLA_PROP_MTU]) { if (m->type_id != TIPC_MEDIA_TYPE_UDP) return -EINVAL; diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index ea0f3c4..58a23b9 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -119,7 
+119,8 @@ struct tipc_media { char *raw); u32 priority; u32 tolerance; - u32 window; + u32 min_win; + u32 max_win; u32 mtu; u32 type_id; u32 hwaddr_len; @@ -158,7 +159,8 @@ struct tipc_bearer { struct packet_type pt; struct rcu_head rcu; u32 priority; - u32 window; + u32 min_win; + u32 max_win; u32 tolerance; u32 domain; u32 identity; diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index f69a2fd..38cdcab 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -37,6 +37,9 @@ #include "core.h" #include "bearer.h" +#define TIPC_MIN_ETH_LINK_WIN 50 +#define TIPC_MAX_ETH_LINK_WIN 500 + /* Convert Ethernet address (media address format) to string */ static int tipc_eth_addr2str(struct tipc_media_addr *addr, char *strbuf, int bufsz) @@ -92,7 +95,8 @@ struct tipc_media eth_media_info = { .raw2addr = tipc_eth_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_MIN_ETH_LINK_WIN, + .max_win = TIPC_MAX_ETH_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_ETH, .hwaddr_len = ETH_ALEN, .name = "eth" diff --git a/net/tipc/ib_media.c b/net/tipc/ib_media.c index e8c1671..7aa9ff8 100644 --- a/net/tipc/ib_media.c +++ b/net/tipc/ib_media.c @@ -42,6 +42,8 @@ #include "core.h" #include "bearer.h" +#define TIPC_MAX_IB_LINK_WIN 500 + /* convert InfiniBand address (media address format) media address to string */ static int tipc_ib_addr2str(struct tipc_media_addr *a, char *str_buf, int str_size) @@ -94,7 +96,8 @@ struct tipc_media ib_media_info = { .raw2addr = tipc_ib_raw2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_MAX_IB_LINK_WIN, .type_id = TIPC_MEDIA_TYPE_IB, .hwaddr_len = INFINIBAND_ALEN, .name = "ib" diff --git a/net/tipc/link.c b/net/tipc/link.c index 038861ba..7f657a6 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -163,7 +163,6 @@ struct tipc_link { struct sk_buff *target_bskb; } backlog[5]; u16 snd_nxt; - u16 window; /* Reception */ u16 rcv_nxt; @@ -174,6 +173,10 @@ struct tipc_link { /* Congestion handling */ struct sk_buff_head wakeupq; + u16 window; + u16 min_win; + u16 max_win; + u16 cong_acks; /* Fragmentation/reassembly */ struct sk_buff *reasm_buf; @@ -307,9 +310,14 @@ u32 tipc_link_id(struct tipc_link *l) return l->peer_bearer_id << 16 | l->bearer_id; } -int tipc_link_window(struct tipc_link *l) +int tipc_link_min_win(struct tipc_link *l) +{ + return l->min_win; +} + +int tipc_link_max_win(struct tipc_link *l) { - return l->window; + return l->max_win; } int tipc_link_prio(struct tipc_link *l) @@ -426,7 +434,8 @@ u32 tipc_link_state(struct tipc_link *l) * @net_plane: network plane (A,B,c..) 
this link belongs to * @mtu: mtu to be advertised by link * @priority: priority to be used by link - * @window: send window to be used by link + * @min_win: minimal send window to be used by link + * @max_win: maximal send window to be used by link * @session: session to be used by link * @ownnode: identity of own node * @peer: node id of peer node @@ -441,7 +450,7 @@ u32 tipc_link_state(struct tipc_link *l) */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 self, + u32 min_win, u32 max_win, u32 session, u32 self, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -485,7 +494,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, l->advertised_mtu = mtu; l->mtu = mtu; l->priority = priority; - tipc_link_set_queue_limits(l, window); + tipc_link_set_queue_limits(l, min_win, max_win); l->ackers = 1; l->bc_sndlink = bc_sndlink; l->bc_rcvlink = bc_rcvlink; @@ -513,7 +522,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * Returns true if link was created, otherwise false */ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -521,9 +530,9 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, { struct tipc_link *l; - if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, window, - 0, ownnode, peer, NULL, peer_caps, bc_sndlink, - NULL, inputq, namedq, link)) + if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, min_win, + max_win, 0, ownnode, peer, NULL, peer_caps, + bc_sndlink, NULL, inputq, namedq, link)) return false; l = *link; @@ -948,7 +957,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, u16 seqno = l->snd_nxt; int pkt_cnt = skb_queue_len(list); int imp = msg_importance(hdr); - unsigned int maxwin = l->window; + unsigned int cwin = l->window; unsigned int mtu = l->mtu; bool new_bundle; int rc = 0; @@ -977,7 +986,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, /* Prepare each packet for sending, and add to relevant queue: */ while ((skb = __skb_dequeue(list))) { - if (likely(skb_queue_len(transmq) < maxwin)) { + if (likely(skb_queue_len(transmq) < cwin)) { hdr = buf_msg(skb); msg_set_seqno(hdr, seqno); msg_set_ack(hdr, ack); @@ -1028,6 +1037,8 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, static void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) { + struct sk_buff_head *txq = &l->transmq; + u16 qlen, add, cwin = l->window; struct sk_buff *skb, *_skb; struct tipc_msg *hdr; u16 seqno = l->snd_nxt; @@ -1035,7 +1046,13 @@ static void tipc_link_advance_backlog(struct tipc_link *l, u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u32 imp; - while (skb_queue_len(&l->transmq) < l->window) { + qlen = skb_queue_len(txq); + if (qlen >= cwin && (l->snd_nxt - buf_seqno(skb_peek(txq)) == qlen)) { + add = l->cong_acks++ % 32 ? 
0 : 1; + cwin = min_t(u16, cwin + add, l->max_win); + l->window = cwin; + } + while (skb_queue_len(txq) < cwin) { skb = skb_peek(&l->backlogq); if (!skb) break; @@ -1130,6 +1147,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, { struct sk_buff *_skb, *skb = skb_peek(&l->transmq); u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + bool retransmitted = false; u16 ack = l->rcv_nxt - 1; struct tipc_msg *hdr; int rc = 0; @@ -1163,11 +1181,13 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; } + if (retransmitted) + l->window = l->min_win + (l->window - l->min_win) / 2; return 0; } @@ -1407,6 +1427,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; + bool retransmitted = false; u16 ack = l->rcv_nxt - 1; bool passed = false; u16 seqno, n = 0; @@ -1440,7 +1461,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - + retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; @@ -1454,7 +1475,8 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, goto next_gap_ack; } } - + if (retransmitted) + l->window = l->min_win + (l->window - l->min_win) / 2; return 0; } @@ -2297,15 +2319,17 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; } -void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) { int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE); - l->window = win; - l->backlog[TIPC_LOW_IMPORTANCE].limit = max_t(u16, 50, win); - l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = max_t(u16, 100, win * 2); - l->backlog[TIPC_HIGH_IMPORTANCE].limit = max_t(u16, 150, win * 3); - l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = max_t(u16, 200, win * 4); + l->window = min_win; + l->min_win = min_win; + l->max_win = max_win; + l->backlog[TIPC_LOW_IMPORTANCE].limit = min_win * 2; + l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = min_win * 4; + l->backlog[TIPC_HIGH_IMPORTANCE].limit = min_win * 6; + l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = min_win * 8; l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } @@ -2358,10 +2382,10 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - if ((win < TIPC_MIN_LINK_WIN) || (win > TIPC_MAX_LINK_WIN)) + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + if (max_win < TIPC_MIN_LINK_WIN || max_win > TIPC_MAX_LINK_WIN) return -EINVAL; } @@ -2597,7 +2621,7 @@ int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; - if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window)) + if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->max_win)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode)) goto prop_msg_full; diff --git a/net/tipc/link.h b/net/tipc/link.h index 
adcad65..caed071 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -73,7 +73,7 @@ enum { bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, - int window, u32 session, u32 ownnode, + u32 min_win, u32 max_win, u32 session, u32 ownnode, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, @@ -81,7 +81,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *namedq, struct tipc_link **link); bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, - int mtu, int window, u16 peer_caps, + int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, @@ -115,7 +115,8 @@ char *tipc_link_name_ext(struct tipc_link *l, char *buf); u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); -int tipc_link_window(struct tipc_link *l); +int tipc_link_min_win(struct tipc_link *l); +int tipc_link_max_win(struct tipc_link *l); void tipc_link_update_caps(struct tipc_link *l, u16 capabilities); bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr); unsigned long tipc_link_tolerance(struct tipc_link *l); @@ -124,7 +125,7 @@ void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, void tipc_link_set_prio(struct tipc_link *l, u32 prio, struct sk_buff_head *xmitq); void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit); -void tipc_link_set_queue_limits(struct tipc_link *l, u32 window); +void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win); int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *link, int nlflags); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); diff --git a/net/tipc/node.c b/net/tipc/node.c index 742c047..9a56348 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -465,7 +465,8 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr, n->active_links[1] = INVALID_BEARER_ID; if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, U16_MAX, - tipc_link_window(tipc_bc_sndlink(net)), + tipc_link_min_win(tipc_bc_sndlink(net)), + tipc_link_max_win(tipc_bc_sndlink(net)), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, @@ -1135,7 +1136,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, - b->window, session, + b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, @@ -2256,10 +2257,12 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { - u32 win; + u32 max_win; - win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); - tipc_link_set_queue_limits(link, win); + max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); + tipc_link_set_queue_limits(link, + tipc_link_min_win(link), + max_win); } } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 43ca5fd..7bcc79a 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -824,7 +824,8 @@ struct tipc_media udp_media_info = { .msg2addr = tipc_udp_msg2addr, .priority = TIPC_DEF_LINK_PRI, .tolerance = TIPC_DEF_LINK_TOL, - .window = TIPC_DEF_LINK_WIN, + .min_win = TIPC_DEF_LINK_WIN, + .max_win = TIPC_DEF_LINK_WIN, .mtu = 
TIPC_DEF_LINK_UDP_MTU, .type_id = TIPC_MEDIA_TYPE_UDP, .hwaddr_len = 0, -- 2.1.4 |
From: Jon M. <jon...@er...> - 2019-11-15 15:48:02
Acked. ///jon > -----Original Message----- > From: Tung Nguyen <tun...@de...> > Sent: 15-Nov-19 04:08 > To: tip...@li...; Jon Maloy <jon...@er...>; ma...@do...; > yin...@wi... > Subject: [tipc-discussion] [net v1 1/1] tipc: fix unreset l->rcv_unacked after message retransmission > > When lost messages are retransmitted, they also carry ACK. > So, l->rcv_unacked needs to be reset. > > Fixes: 9195948fbf34 ("tipc: improve TIPC throughput by Gap ACK blocks") > Signed-off-by: Tung Nguyen <tun...@de...> > --- > net/tipc/link.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/net/tipc/link.c b/net/tipc/link.c > index fb72031..7660e5a 100644 > --- a/net/tipc/link.c > +++ b/net/tipc/link.c > @@ -1448,6 +1448,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, > msg_set_bcast_ack(hdr, bc_ack); > _skb->priority = TC_PRIO_CONTROL; > __skb_queue_tail(xmitq, _skb); > + l->rcv_unacked = 0; > l->stats.retransmitted++; > > /* Increase actual retrans counter & mark first time */ > -- > 2.1.4 |
From: Jon M. <jon...@er...> - 2019-11-15 15:31:41
Acked ///jon > -----Original Message----- > From: Tung Nguyen <tun...@de...> > Sent: 15-Nov-19 04:07 > To: tip...@li...; Jon Maloy <jon...@er...>; ma...@do...; > yin...@wi... > Subject: [tipc-discussion] [net v1 1/1] tipc: fix wrong timeout input for tipc_wait_for_cond() > > In function __tipc_shutdown(), the timeout value passed to > tipc_wait_for_cond() is not jiffies. > > This commit fixes it by converting that value from milliseconds > to jiffies. > > Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") > Signed-off-by: Tung Nguyen <tun...@de...> > --- > net/tipc/socket.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/net/tipc/socket.c b/net/tipc/socket.c > index d7be43f5..506d264 100644 > --- a/net/tipc/socket.c > +++ b/net/tipc/socket.c > @@ -532,7 +532,7 @@ static void __tipc_shutdown(struct socket *sock, int error) > struct sock *sk = sock->sk; > struct tipc_sock *tsk = tipc_sk(sk); > struct net *net = sock_net(sk); > - long timeout = CONN_TIMEOUT_DEFAULT; > + long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); > u32 dnode = tsk_peer_node(tsk); > struct sk_buff *skb; > > -- > 2.1.4 |
From: Tung N. <tun...@de...> - 2019-11-15 09:08:26
When lost messages are retransmitted, they also carry ACK. So, l->rcv_unacked needs to be reset. Fixes: 9195948fbf34 ("tipc: improve TIPC throughput by Gap ACK blocks") Signed-off-by: Tung Nguyen <tun...@de...> --- net/tipc/link.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/tipc/link.c b/net/tipc/link.c index fb72031..7660e5a 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1448,6 +1448,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, msg_set_bcast_ack(hdr, bc_ack); _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); + l->rcv_unacked = 0; l->stats.retransmitted++; /* Increase actual retrans counter & mark first time */ -- 2.1.4 |
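The one-liner is easiest to read next to the lines that already surround it in tipc_link_advance_transmq(): the retransmitted copy is stamped with the newest acknowledgment before it is queued, so it acknowledges everything received so far and the receive-side unacked counter can restart. Annotated excerpt (comments added here for illustration; the msg_set_ack() line comes from the surrounding function rather than the quoted hunk):

        msg_set_ack(hdr, ack);          /* ack == l->rcv_nxt - 1: piggy-backed acknowledgment */
        msg_set_bcast_ack(hdr, bc_ack);
        _skb->priority = TC_PRIO_CONTROL;
        __skb_queue_tail(xmitq, _skb);  /* the retransmission goes out carrying that ACK      */
        l->rcv_unacked = 0;             /* so no separate explicit ACK is owed right now      */
        l->stats.retransmitted++;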
From: Tung N. <tun...@de...> - 2019-11-15 09:07:38
In function __tipc_shutdown(), the timeout value passed to tipc_wait_for_cond() is not jiffies. This commit fixes it by converting that value from milliseconds to jiffies. Fixes: 365ad353c256 ("tipc: reduce risk of user starvation during link congestion") Signed-off-by: Tung Nguyen <tun...@de...> --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index d7be43f5..506d264 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -532,7 +532,7 @@ static void __tipc_shutdown(struct socket *sock, int error) struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); - long timeout = CONN_TIMEOUT_DEFAULT; + long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); u32 dnode = tsk_peer_node(tsk); struct sk_buff *skb; -- 2.1.4 |
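A quick illustration of why the unit matters; the 8000 ms figure and the HZ value below are assumptions chosen for the example, not taken from the TIPC sources:

        /* With HZ=250 one jiffy is 4 ms.  A constant meant as milliseconds,
         * say 8000, then behaves very differently depending on how it is
         * interpreted:
         *
         *   passed straight through as jiffies: 8000 jiffies = 32 seconds
         *   msecs_to_jiffies(8000)            = 2000 jiffies =  8 seconds
         *
         * so without the conversion the wait in __tipc_shutdown() becomes
         * HZ-dependent and, on any HZ below 1000, longer than intended.
         */
        long timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT);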
From: David M. <da...@da...> - 2019-11-15 02:03:32
From: Matt Bennett <mat...@al...> Date: Thu, 14 Nov 2019 12:20:03 +1300 > The tipc prefix for log messages generated by tipc was > removed in commit 07f6c4bc048a ("tipc: convert tipc reference > table to use generic rhashtable"). > > This is still a useful prefix so add it back. > > Signed-off-by: Matt Bennett <mat...@al...> Applied. |
From: Jon M. <jon...@er...> - 2019-11-14 15:58:00
Hi Hoang, You are complicating this too much. The 'syn' and 'rcast' bits should remain internal for the bcast.c rcast/bcast code, and we should not mess with those elsewhere. Instead, you should add a bit in the NAME_DISTR header that marks if the message is bcast/rcast or bulk. For compatibility, this bit should be inverted, so that 1 means bcast/rcast ('not_bulk' or similar) and 0 means 'bulk'. Then, you add another bit, in bulk messages only, meaning 'not_last'. The last bulk message has this bit set to zero, and at that moment you can set the link->named_sync and deliver the deferred queue. As long as the link_named_sync bit is not set, you sort the messages as follows: - Incoming !not_bulk/not_last messages go to the head of the named_deferred queue. - Incoming not_bulk messages go to the tail of the named_deferred queue, irrespective of the not_last bit (which should be always zero anyway) - Incoming !not_bulk/!not_last message is sorted in after the last !not_bulk/not_last message in the queue, and named_sync is set. Because all old messages will be !not_bulk/!not_last (0/0) those will never go into the deferred queue, and named_sync will be set from the start, so we have guaranteed backwards compatibility. I leave it to you to find better name for those bits. I think that should solve our problem in a reasonably simple way. BR ///jon > -----Original Message----- > From: Hoang Le <hoa...@de...> > Sent: 14-Nov-19 03:09 > To: Jon Maloy <jon...@er...>; ma...@do...; tip...@li... > Cc: 'Ying Xue' <yin...@wi...> > Subject: RE: [net-next] tipc: update a binding service via broadcast > > Hi Jon, > > Please take a look at v2. The mechanism looks the same as I did before in commit: > c55c8edafa91 ("tipc: smooth change between replicast and broadcast") > However, in this case we handle only one direction: replicast -> broadcast. > Then, it is still backward compatible. > > [...] > From ae2ee6a7064de3ec1dc2c7df2db241d22b0d129f Mon Sep 17 00:00:00 2001 > From: Hoang Le <hoa...@de...> > Date: Wed, 13 Nov 2019 14:01:03 +0700 > Subject: [PATCH] tipc: update a binding service via broadcast > > Currently, updating binding table (add service binding to > name table/withdraw a service binding) is being sent over replicast. > However, if we are scaling up clusters to > 100 nodes/containers this > method is less affection because of looping through nodes in a cluster one > by one. > > It is worth to use broadcast to update a binding service. Then binding > table updates in all nodes for one shot. > > The mechanism is backward compatible because of sending side changing. 
> > v2: resolve synchronization problem when switching from unicast to > broadcast > > Signed-off-by: Hoang Le <hoa...@de...> > --- > net/tipc/bcast.c | 13 +++++++++++++ > net/tipc/bcast.h | 2 ++ > net/tipc/link.c | 16 ++++++++++++++++ > net/tipc/name_distr.c | 8 ++++++++ > net/tipc/name_table.c | 9 ++++++--- > 5 files changed, 45 insertions(+), 3 deletions(-) > > diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c > index f41096a759fa..18431fa897ab 100644 > --- a/net/tipc/bcast.c > +++ b/net/tipc/bcast.c > @@ -843,3 +843,16 @@ void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, > __skb_queue_tail(inputq, _skb); > } > } > + > +int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb) > +{ > + struct sk_buff_head xmitq; > + u16 cong_link_cnt; > + int rc = 0; > + > + __skb_queue_head_init(&xmitq); > + __skb_queue_tail(&xmitq, skb); > + rc = tipc_bcast_xmit(net, &xmitq, &cong_link_cnt); > + __skb_queue_purge(&xmitq); > + return rc; > +} > diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h > index dadad953e2be..a100da3800fc 100644 > --- a/net/tipc/bcast.h > +++ b/net/tipc/bcast.h > @@ -101,6 +101,8 @@ int tipc_bclink_reset_stats(struct net *net); > u32 tipc_bcast_get_broadcast_mode(struct net *net); > u32 tipc_bcast_get_broadcast_ratio(struct net *net); > > +int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb); > + > void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, > struct sk_buff_head *inputq); > > diff --git a/net/tipc/link.c b/net/tipc/link.c > index fb72031228c9..22f1854435df 100644 > --- a/net/tipc/link.c > +++ b/net/tipc/link.c > @@ -187,6 +187,9 @@ struct tipc_link { > struct tipc_link *bc_sndlink; > u8 nack_state; > bool bc_peer_is_up; > + bool named_sync; > + struct sk_buff_head defer_namedq; > + > > /* Statistics */ > struct tipc_stats stats; > @@ -363,6 +366,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, > trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!"); > tipc_link_reset(rcv_l); > rcv_l->state = LINK_RESET; > + rcv_l->named_sync = false; > if (!snd_l->ackers) { > trace_tipc_link_reset(snd_l, TIPC_DUMP_ALL, "zero ackers!"); > tipc_link_reset(snd_l); > @@ -508,6 +512,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, > __skb_queue_head_init(&l->failover_deferdq); > skb_queue_head_init(&l->wakeupq); > skb_queue_head_init(l->inputq); > + __skb_queue_head_init(&l->defer_namedq); > return true; > } > > @@ -932,6 +937,8 @@ void tipc_link_reset(struct tipc_link *l) > l->silent_intv_cnt = 0; > l->rst_cnt = 0; > l->bc_peer_is_up = false; > + l->named_sync = false; > + __skb_queue_purge(&l->defer_namedq); > memset(&l->mon_state, 0, sizeof(l->mon_state)); > tipc_link_reset_stats(l); > } > @@ -1210,6 +1217,15 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, > return true; > case NAME_DISTRIBUTOR: > l->bc_rcvlink->state = LINK_ESTABLISHED; > + if (msg_is_syn(hdr)) { > + l->bc_rcvlink->named_sync = true; > + skb_queue_splice_tail_init(&l->defer_namedq, l->namedq); I think you would leak the skb here. It is not added to the queue and not deleted. 
> + return true; > + } > + if (msg_is_rcast(hdr) && !l->bc_rcvlink->named_sync) { > + skb_queue_tail(&l->defer_namedq, skb); > + return true; > + } > skb_queue_tail(l->namedq, skb); > return true; > case MSG_BUNDLER: > diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c > index 5feaf3b67380..419b3f0f102d 100644 > --- a/net/tipc/name_distr.c > +++ b/net/tipc/name_distr.c > @@ -180,6 +180,14 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, > skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem)); > __skb_queue_tail(list, skb); > } > + > + /* Allocate dummy message in order to synchronize w/bcast */ > + skb = named_prepare_buf(net, PUBLICATION, 0, dnode); > + if (skb) { > + /* Preparing for 'synching' header */ > + msg_set_syn(buf_msg(skb), 1); > + __skb_queue_tail(list, skb); > + } > } > > /** > diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c > index 66a65c2cdb23..4ba6d73e5c4c 100644 > --- a/net/tipc/name_table.c > +++ b/net/tipc/name_table.c > @@ -632,8 +632,10 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, > exit: > spin_unlock_bh(&tn->nametbl_lock); > > - if (skb) > - tipc_node_broadcast(net, skb); > + if (skb) { > + msg_set_is_rcast(buf_msg(skb), true); > + tipc_bcast_named_publish(net, skb); > + } > return p; > } > > @@ -664,7 +666,8 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, > spin_unlock_bh(&tn->nametbl_lock); > > if (skb) { > - tipc_node_broadcast(net, skb); > + msg_set_is_rcast(buf_msg(skb), true); > + tipc_bcast_named_publish(net, skb); > return 1; > } > return 0; > -- > 2.20.1 > > -----Original Message----- > From: Jon Maloy <jon...@er...> > Sent: Wednesday, November 13, 2019 7:02 PM > To: Hoang Huu Le <hoa...@de...>; ma...@do...; tipc- > dis...@li... > Cc: 'Ying Xue' <yin...@wi...> > Subject: RE: [net-next] tipc: update a binding service via broadcast > > Hi Hoang, > This is good, but you have missed the point about the synchronization problem I have been talking about. > > 1) A new node comes up > 2) The "bulk" binding table update is sent, as a series of packets over the new unicast link. This may take > some time. > 3) The owner of one of the bindings in the bulk (on this node) does unbind. > 4) This is sent as broadcast withdraw to all nodes, and arrives before the last packets of the unicast bulk to > the newly connected node. > 5) Since there is no corresponding publication in the peer node's binding table yet, the withdraw is ignored. > 6) The last bulk unicasts arrive at the new peer, and the now invalid publication is added to its binding > table. > 7) This publication will stay there forever. > > We need to find a way to synchronize so that we know that all the bulk publications are in place in the > binding table before any broadcast publications/withdraws can be accepted. > Obviously, we could create a backlog queue in the name table, but I hope we can find a simpler and neater > solution. > > Regards > ///jon > > > -----Original Message----- > > From: Hoang Le <hoa...@de...> > > Sent: 13-Nov-19 02:35 > > To: Jon Maloy <jon...@er...>; ma...@do...; tip...@li... > > Subject: [net-next] tipc: update a binding service via broadcast > > > > Currently, updating binding table (add service binding to > > name table/withdraw a service binding) is being sent over replicast. > > However, if we are scaling up clusters to > 100 nodes/containers this > > method is less affection because of looping through nodes in a cluster one > > by one. 
> > > > It is worth to use broadcast to update a binding service. Then binding > > table updates in all nodes for one shot. > > > > The mechanism is backward compatible because of sending side changing. > > > > Signed-off-by: Hoang Le <hoa...@de...> > > --- > > net/tipc/bcast.c | 13 +++++++++++++ > > net/tipc/bcast.h | 2 ++ > > net/tipc/name_table.c | 4 ++-- > > 3 files changed, 17 insertions(+), 2 deletions(-) > > > > diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c > > index f41096a759fa..18431fa897ab 100644 > > --- a/net/tipc/bcast.c > > +++ b/net/tipc/bcast.c > > @@ -843,3 +843,16 @@ void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, > > __skb_queue_tail(inputq, _skb); > > } > > } > > + > > +int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb) > > +{ > > + struct sk_buff_head xmitq; > > + u16 cong_link_cnt; > > + int rc = 0; > > + > > + __skb_queue_head_init(&xmitq); > > + __skb_queue_tail(&xmitq, skb); > > + rc = tipc_bcast_xmit(net, &xmitq, &cong_link_cnt); > > + __skb_queue_purge(&xmitq); > > + return rc; > > +} > > diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h > > index dadad953e2be..a100da3800fc 100644 > > --- a/net/tipc/bcast.h > > +++ b/net/tipc/bcast.h > > @@ -101,6 +101,8 @@ int tipc_bclink_reset_stats(struct net *net); > > u32 tipc_bcast_get_broadcast_mode(struct net *net); > > u32 tipc_bcast_get_broadcast_ratio(struct net *net); > > > > +int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb); > > + > > void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, > > struct sk_buff_head *inputq); > > > > diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c > > index 66a65c2cdb23..9e9c61f7c999 100644 > > --- a/net/tipc/name_table.c > > +++ b/net/tipc/name_table.c > > @@ -633,7 +633,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, > > spin_unlock_bh(&tn->nametbl_lock); > > > > if (skb) > > - tipc_node_broadcast(net, skb); > > + tipc_bcast_named_publish(net, skb); > > return p; > > } > > > > @@ -664,7 +664,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, > > spin_unlock_bh(&tn->nametbl_lock); > > > > if (skb) { > > - tipc_node_broadcast(net, skb); > > + tipc_bcast_named_publish(net, skb); > > return 1; > > } > > return 0; > > -- > > 2.20.1 > |
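For concreteness, a rough sketch of the receive-side sorting described in the reply above. Everything here is hypothetical: the two bit accessors and the insert-after-last-bulk helper are invented names, and this is not code from any posted patch.

static void sketch_named_rcv(struct tipc_link *l, struct sk_buff *skb)
{
        bool not_bulk = sketch_msg_not_bulk(buf_msg(skb)); /* 1: bcast/rcast, 0: bulk (assumed accessor) */
        bool not_last = sketch_msg_not_last(buf_msg(skb)); /* bulk only: 0 on the last bulk message      */

        if (l->named_sync) {                     /* already synchronized: deliver as usual */
                skb_queue_tail(l->namedq, skb);
                return;
        }
        if (not_bulk) {                          /* bcast/rcast arriving before sync: defer */
                skb_queue_tail(&l->defer_namedq, skb);
                return;
        }
        if (not_last) {                          /* bulk message, more to come */
                skb_queue_head(&l->defer_namedq, skb);
                return;
        }
        /* Last bulk message (not_bulk = 0, not_last = 0): sort it in after the
         * other bulk messages, mark the link synchronized and release the
         * deferred queue, bulk first and bcast/rcast after.  Legacy peers
         * always send 0/0, so they take this branch at once and stay
         * backwards compatible.
         */
        sketch_insert_after_last_bulk(&l->defer_namedq, skb);
        l->named_sync = true;
        skb_queue_splice_tail_init(&l->defer_namedq, l->namedq);
}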
From: Jon M. <jon...@er...> - 2019-11-14 14:47:44
See below. > -----Original Message----- > From: Tung Nguyen <tun...@de...> > Sent: 14-Nov-19 03:28 > To: tip...@li...; Jon Maloy <jon...@er...>; ma...@do...; > yin...@wi... > Subject: [tipc-discussion] [net v2 1/1] tipc: fix duplicate SYN messages under link congestion > > Scenario: > 1. A client socket initiates a SYN message to a listening socket. > 2. The send link is congested, the SYN message is put in the > send link and a wakeup message is put in wakeup queue. > 3. The congestion situation is abated, the wakeup message is > pulled out of the wakeup queue. Function tipc_sk_push_backlog() > is called to send out delayed messages by Nagle. However, > the client socket is still in CONNECTING state. So, it sends > the SYN message in the socket write queue to the listening socket > again. > 4. The listening socket receives the first SYN message and creates > first server socket. The client socket receives ACK- and establishes > a connection to the first server socket. The client socket closes > its connection with the first server socket. > 5. The listening socket receives the second SYN message and creates > second server socket. The second server socket sends ACK- to the > client socket, but it has been closed. It results in connection > refuse error when reading from the server socket in user space. > > Solution: return from function tipc_sk_push_backlog() immediately > if there is pending SYN message in the socket write queue. > > v2: use Jon's suggestion. > > Fixes: c0bceb97db9e ("tipc: add smart nagle feature") > Signed-off-by: Tung Nguyen <tun...@de...> > --- > net/tipc/socket.c | 16 ++++++++++------ > 1 file changed, 10 insertions(+), 6 deletions(-) > > diff --git a/net/tipc/socket.c b/net/tipc/socket.c > index 5d7859a..7a402ee 100644 > --- a/net/tipc/socket.c > +++ b/net/tipc/socket.c > @@ -540,12 +540,10 @@ static void __tipc_shutdown(struct socket *sock, int error) > tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && > !tsk_conn_cong(tsk))); > > - /* Push out unsent messages or remove if pending SYN */ > - skb = skb_peek(&sk->sk_write_queue); > - if (skb && !msg_is_syn(buf_msg(skb))) > - tipc_sk_push_backlog(tsk); > - else > - __skb_queue_purge(&sk->sk_write_queue); > + /* Push out delayed messages if in Nagle mode */ > + tipc_sk_push_backlog(tsk); > + /* Remove pending SYN */ > + __skb_queue_purge(&sk->sk_write_queue); > > /* Reject all unreceived messages, except on an active connection > * (which disconnects locally & sends a 'FIN+' to peer). > @@ -1248,11 +1246,17 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk) > struct sk_buff_head *txq = &tsk->sk.sk_write_queue; > struct net *net = sock_net(&tsk->sk); > u32 dnode = tsk_peer_node(tsk); > + struct sk_buff *skb; > int rc; > > if (skb_queue_empty(txq) || tsk->cong_link_cnt) > return; > > + skb = skb_peek(txq); Move this line to before the if() clause.Then test for if (!skb ||...) instead of skb_queue_empty(). This way we should save a couple of instructions. Acked-by:jon > + /* Do not send SYN again after congestion */ > + if (msg_is_syn(buf_msg(skb))) > + return; > + > tsk->snt_unacked += tsk->snd_backlog; > tsk->snd_backlog = 0; > tsk->expect_ack = true; > -- > 2.1.4 |
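For concreteness, this is roughly what the function entry would look like with the review comment applied; an illustrative sketch only, not necessarily the code that was finally merged:

static void tipc_sk_push_backlog(struct tipc_sock *tsk)
{
        struct sk_buff_head *txq = &tsk->sk.sk_write_queue;
        struct sk_buff *skb = skb_peek(txq);   /* NULL when the queue is empty */

        if (!skb || tsk->cong_link_cnt)        /* replaces the skb_queue_empty() test */
                return;

        if (msg_is_syn(buf_msg(skb)))          /* never push a pending SYN again */
                return;

        /* ... push out the delayed Nagle messages as before ... */
}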
From: Tung N. <tun...@de...> - 2019-11-14 08:28:35
Scenario: 1. A client socket initiates a SYN message to a listening socket. 2. The send link is congested, the SYN message is put in the send link and a wakeup message is put in wakeup queue. 3. The congestion situation is abated, the wakeup message is pulled out of the wakeup queue. Function tipc_sk_push_backlog() is called to send out delayed messages by Nagle. However, the client socket is still in CONNECTING state. So, it sends the SYN message in the socket write queue to the listening socket again. 4. The listening socket receives the first SYN message and creates first server socket. The client socket receives ACK- and establishes a connection to the first server socket. The client socket closes its connection with the first server socket. 5. The listening socket receives the second SYN message and creates second server socket. The second server socket sends ACK- to the client socket, but it has been closed. It results in connection refuse error when reading from the server socket in user space. Solution: return from function tipc_sk_push_backlog() immediately if there is pending SYN message in the socket write queue. v2: use Jon's suggestion. Fixes: c0bceb97db9e ("tipc: add smart nagle feature") Signed-off-by: Tung Nguyen <tun...@de...> --- net/tipc/socket.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 5d7859a..7a402ee 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -540,12 +540,10 @@ static void __tipc_shutdown(struct socket *sock, int error) tipc_wait_for_cond(sock, &timeout, (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))); - /* Push out unsent messages or remove if pending SYN */ - skb = skb_peek(&sk->sk_write_queue); - if (skb && !msg_is_syn(buf_msg(skb))) - tipc_sk_push_backlog(tsk); - else - __skb_queue_purge(&sk->sk_write_queue); + /* Push out delayed messages if in Nagle mode */ + tipc_sk_push_backlog(tsk); + /* Remove pending SYN */ + __skb_queue_purge(&sk->sk_write_queue); /* Reject all unreceived messages, except on an active connection * (which disconnects locally & sends a 'FIN+' to peer). @@ -1248,11 +1246,17 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk) struct sk_buff_head *txq = &tsk->sk.sk_write_queue; struct net *net = sock_net(&tsk->sk); u32 dnode = tsk_peer_node(tsk); + struct sk_buff *skb; int rc; if (skb_queue_empty(txq) || tsk->cong_link_cnt) return; + skb = skb_peek(txq); + /* Do not send SYN again after congestion */ + if (msg_is_syn(buf_msg(skb))) + return; + tsk->snt_unacked += tsk->snd_backlog; tsk->snd_backlog = 0; tsk->expect_ack = true; -- 2.1.4 |
From: Hoang L. <hoa...@de...> - 2019-11-14 08:09:30
Hi Jon, Please take a look at v2. The mechanism looks the same as I did before in commit: c55c8edafa91 ("tipc: smooth change between replicast and broadcast") However, in this case we handle only one direction: replicast -> broadcast. Then, it is still backward compatible. [...] >From ae2ee6a7064de3ec1dc2c7df2db241d22b0d129f Mon Sep 17 00:00:00 2001 From: Hoang Le <hoa...@de...> Date: Wed, 13 Nov 2019 14:01:03 +0700 Subject: [PATCH] tipc: update a binding service via broadcast Currently, updating binding table (add service binding to name table/withdraw a service binding) is being sent over replicast. However, if we are scaling up clusters to > 100 nodes/containers this method is less affection because of looping through nodes in a cluster one by one. It is worth to use broadcast to update a binding service. Then binding table updates in all nodes for one shot. The mechanism is backward compatible because of sending side changing. v2: resolve synchronization problem when switching from unicast to broadcast Signed-off-by: Hoang Le <hoa...@de...> --- net/tipc/bcast.c | 13 +++++++++++++ net/tipc/bcast.h | 2 ++ net/tipc/link.c | 16 ++++++++++++++++ net/tipc/name_distr.c | 8 ++++++++ net/tipc/name_table.c | 9 ++++++--- 5 files changed, 45 insertions(+), 3 deletions(-) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index f41096a759fa..18431fa897ab 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -843,3 +843,16 @@ void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, __skb_queue_tail(inputq, _skb); } } + +int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb) +{ + struct sk_buff_head xmitq; + u16 cong_link_cnt; + int rc = 0; + + __skb_queue_head_init(&xmitq); + __skb_queue_tail(&xmitq, skb); + rc = tipc_bcast_xmit(net, &xmitq, &cong_link_cnt); + __skb_queue_purge(&xmitq); + return rc; +} diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index dadad953e2be..a100da3800fc 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -101,6 +101,8 @@ int tipc_bclink_reset_stats(struct net *net); u32 tipc_bcast_get_broadcast_mode(struct net *net); u32 tipc_bcast_get_broadcast_ratio(struct net *net); +int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb); + void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, struct sk_buff_head *inputq); diff --git a/net/tipc/link.c b/net/tipc/link.c index fb72031228c9..22f1854435df 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -187,6 +187,9 @@ struct tipc_link { struct tipc_link *bc_sndlink; u8 nack_state; bool bc_peer_is_up; + bool named_sync; + struct sk_buff_head defer_namedq; + /* Statistics */ struct tipc_stats stats; @@ -363,6 +366,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!"); tipc_link_reset(rcv_l); rcv_l->state = LINK_RESET; + rcv_l->named_sync = false; if (!snd_l->ackers) { trace_tipc_link_reset(snd_l, TIPC_DUMP_ALL, "zero ackers!"); tipc_link_reset(snd_l); @@ -508,6 +512,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, __skb_queue_head_init(&l->failover_deferdq); skb_queue_head_init(&l->wakeupq); skb_queue_head_init(l->inputq); + __skb_queue_head_init(&l->defer_namedq); return true; } @@ -932,6 +937,8 @@ void tipc_link_reset(struct tipc_link *l) l->silent_intv_cnt = 0; l->rst_cnt = 0; l->bc_peer_is_up = false; + l->named_sync = false; + __skb_queue_purge(&l->defer_namedq); memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); } @@ -1210,6 +1217,15 @@ static bool 
tipc_data_input(struct tipc_link *l, struct sk_buff *skb, return true; case NAME_DISTRIBUTOR: l->bc_rcvlink->state = LINK_ESTABLISHED; + if (msg_is_syn(hdr)) { + l->bc_rcvlink->named_sync = true; + skb_queue_splice_tail_init(&l->defer_namedq, l->namedq); + return true; + } + if (msg_is_rcast(hdr) && !l->bc_rcvlink->named_sync) { + skb_queue_tail(&l->defer_namedq, skb); + return true; + } skb_queue_tail(l->namedq, skb); return true; case MSG_BUNDLER: diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 5feaf3b67380..419b3f0f102d 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -180,6 +180,14 @@ static void named_distribute(struct net *net, struct sk_buff_head *list, skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem)); __skb_queue_tail(list, skb); } + + /* Allocate dummy message in order to synchronize w/bcast */ + skb = named_prepare_buf(net, PUBLICATION, 0, dnode); + if (skb) { + /* Preparing for 'synching' header */ + msg_set_syn(buf_msg(skb), 1); + __skb_queue_tail(list, skb); + } } /** diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 66a65c2cdb23..4ba6d73e5c4c 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -632,8 +632,10 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, exit: spin_unlock_bh(&tn->nametbl_lock); - if (skb) - tipc_node_broadcast(net, skb); + if (skb) { + msg_set_is_rcast(buf_msg(skb), true); + tipc_bcast_named_publish(net, skb); + } return p; } @@ -664,7 +666,8 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, spin_unlock_bh(&tn->nametbl_lock); if (skb) { - tipc_node_broadcast(net, skb); + msg_set_is_rcast(buf_msg(skb), true); + tipc_bcast_named_publish(net, skb); return 1; } return 0; -- 2.20.1 -----Original Message----- From: Jon Maloy <jon...@er...> Sent: Wednesday, November 13, 2019 7:02 PM To: Hoang Huu Le <hoa...@de...>; ma...@do...; tip...@li... Cc: 'Ying Xue' <yin...@wi...> Subject: RE: [net-next] tipc: update a binding service via broadcast Hi Hoang, This is good, but you have missed the point about the synchronization problem I have been talking about. 1) A new node comes up 2) The "bulk" binding table update is sent, as a series of packets over the new unicast link. This may take some time. 3) The owner of one of the bindings in the bulk (on this node) does unbind. 4) This is sent as broadcast withdraw to all nodes, and arrives before the last packets of the unicast bulk to the newly connected node. 5) Since there is no corresponding publication in the peer node's binding table yet, the withdraw is ignored. 6) The last bulk unicasts arrive at the new peer, and the now invalid publication is added to its binding table. 7) This publication will stay there forever. We need to find a way to synchronize so that we know that all the bulk publications are in place in the binding table before any broadcast publications/withdraws can be accepted. Obviously, we could create a backlog queue in the name table, but I hope we can find a simpler and neater solution. Regards ///jon > -----Original Message----- > From: Hoang Le <hoa...@de...> > Sent: 13-Nov-19 02:35 > To: Jon Maloy <jon...@er...>; ma...@do...; tip...@li... > Subject: [net-next] tipc: update a binding service via broadcast > > Currently, updating binding table (add service binding to > name table/withdraw a service binding) is being sent over replicast. 
> However, if we are scaling up clusters to > 100 nodes/containers this > method is less affection because of looping through nodes in a cluster one > by one. > > It is worth to use broadcast to update a binding service. Then binding > table updates in all nodes for one shot. > > The mechanism is backward compatible because of sending side changing. > > Signed-off-by: Hoang Le <hoa...@de...> > --- > net/tipc/bcast.c | 13 +++++++++++++ > net/tipc/bcast.h | 2 ++ > net/tipc/name_table.c | 4 ++-- > 3 files changed, 17 insertions(+), 2 deletions(-) > > diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c > index f41096a759fa..18431fa897ab 100644 > --- a/net/tipc/bcast.c > +++ b/net/tipc/bcast.c > @@ -843,3 +843,16 @@ void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, > __skb_queue_tail(inputq, _skb); > } > } > + > +int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb) > +{ > + struct sk_buff_head xmitq; > + u16 cong_link_cnt; > + int rc = 0; > + > + __skb_queue_head_init(&xmitq); > + __skb_queue_tail(&xmitq, skb); > + rc = tipc_bcast_xmit(net, &xmitq, &cong_link_cnt); > + __skb_queue_purge(&xmitq); > + return rc; > +} > diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h > index dadad953e2be..a100da3800fc 100644 > --- a/net/tipc/bcast.h > +++ b/net/tipc/bcast.h > @@ -101,6 +101,8 @@ int tipc_bclink_reset_stats(struct net *net); > u32 tipc_bcast_get_broadcast_mode(struct net *net); > u32 tipc_bcast_get_broadcast_ratio(struct net *net); > > +int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb); > + > void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq, > struct sk_buff_head *inputq); > > diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c > index 66a65c2cdb23..9e9c61f7c999 100644 > --- a/net/tipc/name_table.c > +++ b/net/tipc/name_table.c > @@ -633,7 +633,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower, > spin_unlock_bh(&tn->nametbl_lock); > > if (skb) > - tipc_node_broadcast(net, skb); > + tipc_bcast_named_publish(net, skb); > return p; > } > > @@ -664,7 +664,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, > spin_unlock_bh(&tn->nametbl_lock); > > if (skb) { > - tipc_node_broadcast(net, skb); > + tipc_bcast_named_publish(net, skb); > return 1; > } > return 0; > -- > 2.20.1 |
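A minimal user-space sketch of the synchronization scheme the v2 patch converges on: bulk name-table updates arrive over the new unicast link and are terminated by a SYN-marked dummy message, while broadcast updates received before that marker are parked on a deferred queue and applied only once the bulk is complete. The structures and helper names below are invented for illustration; this is a model of the idea, not TIPC kernel code.

#include <stdbool.h>
#include <stdio.h>

struct upd {
        bool syn;               /* end-of-bulk marker (unicast path) */
        bool rcast;             /* arrived via broadcast */
        char text[32];
};

static struct upd deferq[16];
static int defer_cnt;
static bool named_sync;         /* has the unicast bulk completed? */

static void apply(const struct upd *u)
{
        printf("apply: %s\n", u->text);
}

static void rcv_named(const struct upd *u)
{
        int i;

        if (u->syn) {           /* bulk done: release deferred bcast updates */
                named_sync = true;
                for (i = 0; i < defer_cnt; i++)
                        apply(&deferq[i]);
                defer_cnt = 0;
                return;
        }
        if (u->rcast && !named_sync) {  /* too early: park it */
                deferq[defer_cnt++] = *u;
                return;
        }
        apply(u);
}

int main(void)
{
        struct upd bulk  = { .text = "bulk publish {18888,66,66}" };
        struct upd bcast = { .rcast = true,
                             .text = "bcast withdraw {18888,66,66}" };
        struct upd syn   = { .syn = true };

        rcv_named(&bulk);       /* applied at once (unicast bulk) */
        rcv_named(&bcast);      /* deferred: bulk not yet complete */
        rcv_named(&syn);        /* bulk ends -> deferred withdraw applied now */
        return 0;
}

With this ordering rule, the withdraw in step 4 of Jon's scenario can no longer be processed (and dropped) before the corresponding bulk publication has been added, which is exactly the gap the deferred namedq in the v2 patch closes.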
From: Tuong L. <tuo...@de...> - 2019-11-14 06:43:59
|
It is observed that TIPC service binding order will not be kept in the publication event report to user if the service is subscribed after the bindings. For example, services are bound by application in the following order: Server: bound port A to {18888,66,66} scope 2 Server: bound port A to {18888,33,33} scope 2 Now, if a client subscribes to the service range (e.g. {18888, 0-100}), it will get the 'TIPC_PUBLISHED' events in that binding order only when the subscription is started before the bindings. Otherwise, if started after the bindings, the events will arrive in the opposite order: Client: received event for published {18888,33,33} Client: received event for published {18888,66,66} For the latter case, it is clear that the bindings have existed in the name table already, so when reported, the events' order will follow the order of the rbtree binding nodes (- a node with lesser 'lower'/'upper' range value will be first). This is correct as we provide the tracking on a specific service status (available or not), not the relationship between multiple services. However, some users expect to see the same order of arriving events irrespective of when the subscription is issued. This turns out to be easy to fix. We now add functionality to ensure that publication events always are issued in the same temporal order as the corresponding bindings were performed. v2: extend the sorting per publications, so guarantee a temporal order even if users set their subscription filter to 'TIPC_SUB_PORTS'. Acked-by: Jon Maloy <jon...@er...> Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/name_table.c | 47 +++++++++++++++++++++++++++++++++++++++-------- net/tipc/name_table.h | 4 ++++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 66a65c2cdb23..45a9ab539fdb 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -35,6 +35,7 @@ */ #include <net/sock.h> +#include <linux/list_sort.h> #include "core.h" #include "netlink.h" #include "name_table.h" @@ -66,6 +67,7 @@ struct service_range { /** * struct tipc_service - container for all published instances of a service type * @type: 32 bit 'type' value for service + * @publ_cnt: increasing counter for publications in this service * @ranges: rb tree containing all service ranges for this service * @service_list: links to adjacent name ranges in hash chain * @subscriptions: list of subscriptions for this service type @@ -74,6 +76,7 @@ struct service_range { */ struct tipc_service { u32 type; + unsigned int publ_cnt; struct rb_root ranges; struct hlist_node service_list; struct list_head subscriptions; @@ -109,6 +112,7 @@ static struct publication *tipc_publ_create(u32 type, u32 lower, u32 upper, INIT_LIST_HEAD(&publ->binding_node); INIT_LIST_HEAD(&publ->local_publ); INIT_LIST_HEAD(&publ->all_publ); + INIT_LIST_HEAD(&publ->list); return publ; } @@ -244,6 +248,7 @@ static struct publication *tipc_service_insert_publ(struct net *net, p = tipc_publ_create(type, lower, upper, scope, node, port, key); if (!p) goto err; + p->id = sc->publ_cnt++; if (in_own_node(net, node)) list_add(&p->local_publ, &sr->local_publ); list_add(&p->all_publ, &sr->all_publ); @@ -277,6 +282,17 @@ static struct publication *tipc_service_remove_publ(struct service_range *sr, return NULL; } +#define publication_after(pa, pb) (((int)((pb)->id - (pa)->id) < 0)) +static int tipc_publ_sort(void *priv, struct list_head *a, + struct list_head *b) +{ + struct publication *pa, *pb; + + pa = container_of(a, struct 
publication, list); + pb = container_of(b, struct publication, list); + return publication_after(pa, pb); +} + /** * tipc_service_subscribe - attach a subscription, and optionally * issue the prescribed number of events if there is any service @@ -286,36 +302,51 @@ static void tipc_service_subscribe(struct tipc_service *service, struct tipc_subscription *sub) { struct tipc_subscr *sb = &sub->evt.s; + struct publication *p, *first, *tmp; + struct list_head publ_list; struct service_range *sr; struct tipc_name_seq ns; - struct publication *p; struct rb_node *n; - bool first; + u32 filter; ns.type = tipc_sub_read(sb, seq.type); ns.lower = tipc_sub_read(sb, seq.lower); ns.upper = tipc_sub_read(sb, seq.upper); + filter = tipc_sub_read(sb, filter); tipc_sub_get(sub); list_add(&sub->service_list, &service->subscriptions); - if (tipc_sub_read(sb, filter) & TIPC_SUB_NO_STATUS) + if (filter & TIPC_SUB_NO_STATUS) return; + INIT_LIST_HEAD(&publ_list); for (n = rb_first(&service->ranges); n; n = rb_next(n)) { sr = container_of(n, struct service_range, tree_node); if (sr->lower > ns.upper) break; if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper)) continue; - first = true; + first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { - tipc_sub_report_overlap(sub, sr->lower, sr->upper, - TIPC_PUBLISHED, p->port, - p->node, p->scope, first); - first = false; + if (filter & TIPC_SUB_PORTS) + list_add_tail(&p->list, &publ_list); + else if (!first || publication_after(first, p)) + /* Pick this range's *first* publication */ + first = p; } + if (first) + list_add_tail(&first->list, &publ_list); + } + + /* Sort the publications before reporting */ + list_sort(NULL, &publ_list, tipc_publ_sort); + list_for_each_entry_safe(p, tmp, &publ_list, list) { + tipc_sub_report_overlap(sub, p->lower, p->upper, + TIPC_PUBLISHED, p->port, p->node, + p->scope, true); + list_del_init(&p->list); } } diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h index f79066334cc8..3d5da71ce41e 100644 --- a/net/tipc/name_table.h +++ b/net/tipc/name_table.h @@ -58,6 +58,7 @@ struct tipc_group; * @node: network address of publishing socket's node * @port: publishing port * @key: publication key, unique across the cluster + * @id: publication id * @binding_node: all publications from the same node which bound this one * - Remote publications: in node->publ_list * Used by node/name distr to withdraw publications when node is lost @@ -69,6 +70,7 @@ struct tipc_group; * Used by closest_first and multicast receive lookup algorithms * @all_publ: all publications identical to this one, whatever node and scope * Used by round-robin lookup algorithm + * @list: to form a list of publications in temporal order * @rcu: RCU callback head used for deferred freeing */ struct publication { @@ -79,10 +81,12 @@ struct publication { u32 node; u32 port; u32 key; + unsigned int id; struct list_head binding_node; struct list_head binding_sock; struct list_head local_publ; struct list_head all_publ; + struct list_head list; struct rcu_head rcu; }; -- 2.13.7 |
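The approach above can be modeled outside the kernel in a few lines: give every publication a monotonically increasing id at binding time, then sort the collected publications with a wrap-safe comparison before reporting. The small program below is only a sketch of that idea, with qsort() standing in for list_sort() and made-up struct and function names; it is not the patch itself.

#include <stdio.h>
#include <stdlib.h>

struct pub {
        unsigned int id;                /* assigned as publ_cnt++ at bind time */
        unsigned int lower, upper;
};

/* Wrap-safe "a was published after b", same trick as publication_after() */
static int published_after(const struct pub *a, const struct pub *b)
{
        return (int)(b->id - a->id) < 0;
}

static int cmp_pub(const void *pa, const void *pb)
{
        const struct pub *a = pa, *b = pb;

        return published_after(a, b) ? 1 : -1;          /* older ids first */
}

int main(void)
{
        /* rbtree walk order (by range value) vs. binding order (by id) */
        struct pub walk_order[] = {
                { .id = 1, .lower = 33, .upper = 33 },  /* bound second */
                { .id = 0, .lower = 66, .upper = 66 },  /* bound first  */
        };
        int i;

        qsort(walk_order, 2, sizeof(walk_order[0]), cmp_pub);
        for (i = 0; i < 2; i++)
                printf("TIPC_PUBLISHED event: {18888,%u,%u}\n",
                       walk_order[i].lower, walk_order[i].upper);
        return 0;
}

Run against the example from the commit message, this prints the event for {18888,66,66} before {18888,33,33}, i.e. in binding order rather than rbtree order.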
From: Tuong L. <tuo...@de...> - 2019-11-14 06:43:49
|
The series consists of two individual patches that were sent earlier; this is v2 with some minor updates.

Tuong Lien (2):
  tipc: support in-order name publication events
  tipc: fix name table rbtree issues

 net/tipc/name_table.c | 326 +++++++++++++++++++++++++++++++++-----------------
 net/tipc/name_table.h |   4 +
 2 files changed, 222 insertions(+), 108 deletions(-)

--
2.13.7
|
From: Tuong L. <tuo...@de...> - 2019-11-14 06:43:49
|
The current rbtree for service ranges in the name table is built based on the 'lower' & 'upper' range values, which results in a flaw in the rbtree searching. Some issues have been observed when ranges overlap:

Case #1: unable to withdraw a name entry:

After some name services are bound, all of them are withdrawn by the user, but one remains in the name table forever. This corrupts the table, and that service becomes a dummy, i.e. one with no real port.
E.g.

                      /
              {22, 22}
                 /
                /
      ---> {10, 50}
            /    \
           /      \
      {10, 30}   {20, 60}

The node {10, 30} cannot be removed since the rbtree search stops at its ancestor, i.e. {10, 50}, and a search starting from there will never reach the node we are looking for.

Case #2: failed to send data in some cases:

E.g. Two service ranges, {20, 60} and {10, 50}, are bound. The rbtree for this service will be one of the two cases below, depending on the order of the bindings:

      {20, 60}                 {10, 50} <--
       /    \                   /    \
      /      \                 /      \
 {10, 50}    NIL <--         NIL    {20, 60}

         (a)                      (b)

Now, when trying to send some data to service instance {30}, there are two possible results:
(a): Failed, no route to host.
(b): Ok.

The reason is that the rbtree search stops at the node the arrow points to, as shown above.

Case #3: Same as case #2b above, but if the send has local scope and {10, 50} is published by a peer node, the result is 'no route to host' even though the other range, {20, 60}, may be published e.g. on the local node, which should be able to receive the data.

The issues are actually due to the way we built the rbtree. This commit fixes it by introducing an additional field in each node, named 'max', which is the largest 'upper' value in that node's subtree. The 'max' value of each subtree is propagated correctly by the augmented rbtree callbacks whenever a node is inserted/removed or the tree is rebalanced. This way, we can change the rbtree search approach to solve the issues above. Another benefit is that we can now improve the search for the next matching range, e.g. in the multicast case, and get rid of the unneeded looping over all nodes in the tree.

v2: remove the 'round-robin' case description (i.e. not fixed) & call service_range_match_first() directly, as suggested by Jon; rebase to make a series.
Acked-by: Jon Maloy <jon...@er...> Signed-off-by: Tuong Lien <tuo...@de...> --- net/tipc/name_table.c | 279 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 179 insertions(+), 100 deletions(-) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 45a9ab539fdb..2290b09e080d 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -36,6 +36,7 @@ #include <net/sock.h> #include <linux/list_sort.h> +#include <linux/rbtree_augmented.h> #include "core.h" #include "netlink.h" #include "name_table.h" @@ -51,6 +52,7 @@ * @lower: service range lower bound * @upper: service range upper bound * @tree_node: member of service range RB tree + * @max: largest 'upper' in this node subtree * @local_publ: list of identical publications made from this node * Used by closest_first lookup and multicast lookup algorithm * @all_publ: all publications identical to this one, whatever node and scope @@ -60,6 +62,7 @@ struct service_range { u32 lower; u32 upper; struct rb_node tree_node; + u32 max; struct list_head local_publ; struct list_head all_publ; }; @@ -84,6 +87,130 @@ struct tipc_service { struct rcu_head rcu; }; +#define service_range_upper(sr) ((sr)->upper) +RB_DECLARE_CALLBACKS_MAX(static, sr_callbacks, + struct service_range, tree_node, u32, max, + service_range_upper) + +#define service_range_entry(rbtree_node) \ + (container_of(rbtree_node, struct service_range, tree_node)) + +#define service_range_overlap(sr, start, end) \ + ((sr)->lower <= (end) && (sr)->upper >= (start)) + +/** + * service_range_foreach_match - iterate over tipc service rbtree for each + * range match + * @sr: the service range pointer as a loop cursor + * @sc: the pointer to tipc service which holds the service range rbtree + * @start, end: the range (end >= start) for matching + */ +#define service_range_foreach_match(sr, sc, start, end) \ + for (sr = service_range_match_first((sc)->ranges.rb_node, \ + start, \ + end); \ + sr; \ + sr = service_range_match_next(&(sr)->tree_node, \ + start, \ + end)) + +/** + * service_range_match_first - find first service range matching a range + * @n: the root node of service range rbtree for searching + * @start, end: the range (end >= start) for matching + * + * Return: the leftmost service range node in the rbtree that overlaps the + * specific range if any. Otherwise, returns NULL. + */ +static struct service_range *service_range_match_first(struct rb_node *n, + u32 start, u32 end) +{ + struct service_range *sr; + struct rb_node *l, *r; + + /* Non overlaps in tree at all? */ + if (!n || service_range_entry(n)->max < start) + return NULL; + + while (n) { + l = n->rb_left; + if (l && service_range_entry(l)->max >= start) { + /* A leftmost overlap range node must be one in the left + * subtree. If not, it has lower > end, then nodes on + * the right side cannot satisfy the condition either. + */ + n = l; + continue; + } + + /* No one in the left subtree can match, return if this node is + * an overlap i.e. leftmost. 
+ */ + sr = service_range_entry(n); + if (service_range_overlap(sr, start, end)) + return sr; + + /* Ok, try to lookup on the right side */ + r = n->rb_right; + if (sr->lower <= end && + r && service_range_entry(r)->max >= start) { + n = r; + continue; + } + break; + } + + return NULL; +} + +/** + * service_range_match_next - find next service range matching a range + * @n: a node in service range rbtree from which the searching starts + * @start, end: the range (end >= start) for matching + * + * Return: the next service range node to the given node in the rbtree that + * overlaps the specific range if any. Otherwise, returns NULL. + */ +static struct service_range *service_range_match_next(struct rb_node *n, + u32 start, u32 end) +{ + struct service_range *sr; + struct rb_node *p, *r; + + while (n) { + r = n->rb_right; + if (r && service_range_entry(r)->max >= start) + /* A next overlap range node must be one in the right + * subtree. If not, it has lower > end, then any next + * successor (- an ancestor) of this node cannot + * satisfy the condition either. + */ + return service_range_match_first(r, start, end); + + /* No one in the right subtree can match, go up to find an + * ancestor of this node which is parent of a left-hand child. + */ + while ((p = rb_parent(n)) && n == p->rb_right) + n = p; + if (!p) + break; + + /* Return if this ancestor is an overlap */ + sr = service_range_entry(p); + if (service_range_overlap(sr, start, end)) + return sr; + + /* Ok, try to lookup more from this ancestor */ + if (sr->lower <= end) { + n = p; + continue; + } + break; + } + + return NULL; +} + static int hash(int x) { return x & (TIPC_NAMETBL_SIZE - 1); @@ -139,84 +266,51 @@ static struct tipc_service *tipc_service_create(u32 type, struct hlist_head *hd) return service; } -/** - * tipc_service_first_range - find first service range in tree matching instance - * - * Very time-critical, so binary search through range rb tree - */ -static struct service_range *tipc_service_first_range(struct tipc_service *sc, - u32 instance) -{ - struct rb_node *n = sc->ranges.rb_node; - struct service_range *sr; - - while (n) { - sr = container_of(n, struct service_range, tree_node); - if (sr->lower > instance) - n = n->rb_left; - else if (sr->upper < instance) - n = n->rb_right; - else - return sr; - } - return NULL; -} - /* tipc_service_find_range - find service range matching publication parameters */ static struct service_range *tipc_service_find_range(struct tipc_service *sc, u32 lower, u32 upper) { - struct rb_node *n = sc->ranges.rb_node; struct service_range *sr; - sr = tipc_service_first_range(sc, lower); - if (!sr) - return NULL; - - /* Look for exact match */ - for (n = &sr->tree_node; n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->upper == upper) - break; + service_range_foreach_match(sr, sc, lower, upper) { + /* Look for exact match */ + if (sr->lower == lower && sr->upper == upper) + return sr; } - if (!n || sr->lower != lower || sr->upper != upper) - return NULL; - return sr; + return NULL; } static struct service_range *tipc_service_create_range(struct tipc_service *sc, u32 lower, u32 upper) { struct rb_node **n, *parent = NULL; - struct service_range *sr, *tmp; + struct service_range *sr; n = &sc->ranges.rb_node; while (*n) { - tmp = container_of(*n, struct service_range, tree_node); parent = *n; - tmp = container_of(parent, struct service_range, tree_node); - if (lower < tmp->lower) - n = &(*n)->rb_left; - else if (lower > tmp->lower) - n = 
&(*n)->rb_right; - else if (upper < tmp->upper) - n = &(*n)->rb_left; - else if (upper > tmp->upper) - n = &(*n)->rb_right; + sr = service_range_entry(parent); + if (lower == sr->lower && upper == sr->upper) + return sr; + if (sr->max < upper) + sr->max = upper; + if (lower <= sr->lower) + n = &parent->rb_left; else - return tmp; + n = &parent->rb_right; } sr = kzalloc(sizeof(*sr), GFP_ATOMIC); if (!sr) return NULL; sr->lower = lower; sr->upper = upper; + sr->max = upper; INIT_LIST_HEAD(&sr->local_publ); INIT_LIST_HEAD(&sr->all_publ); rb_link_node(&sr->tree_node, parent, n); - rb_insert_color(&sr->tree_node, &sc->ranges); + rb_insert_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); return sr; } @@ -306,7 +400,6 @@ static void tipc_service_subscribe(struct tipc_service *service, struct list_head publ_list; struct service_range *sr; struct tipc_name_seq ns; - struct rb_node *n; u32 filter; ns.type = tipc_sub_read(sb, seq.type); @@ -321,13 +414,7 @@ static void tipc_service_subscribe(struct tipc_service *service, return; INIT_LIST_HEAD(&publ_list); - for (n = rb_first(&service->ranges); n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->lower > ns.upper) - break; - if (!tipc_sub_check_overlap(&ns, sr->lower, sr->upper)) - continue; - + service_range_foreach_match(sr, service, ns.lower, ns.upper) { first = NULL; list_for_each_entry(p, &sr->all_publ, all_publ) { if (filter & TIPC_SUB_PORTS) @@ -421,7 +508,7 @@ struct publication *tipc_nametbl_remove_publ(struct net *net, u32 type, /* Remove service range item if this was its last publication */ if (list_empty(&sr->all_publ)) { - rb_erase(&sr->tree_node, &sc->ranges); + rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } @@ -469,34 +556,39 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *dnode) rcu_read_lock(); sc = tipc_service_find(net, type); if (unlikely(!sc)) - goto not_found; + goto exit; spin_lock_bh(&sc->lock); - sr = tipc_service_first_range(sc, instance); - if (unlikely(!sr)) - goto no_match; - - /* Select lookup algorithm: local, closest-first or round-robin */ - if (*dnode == self) { - list = &sr->local_publ; - if (list_empty(list)) - goto no_match; - p = list_first_entry(list, struct publication, local_publ); - list_move_tail(&p->local_publ, &sr->local_publ); - } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) { - list = &sr->local_publ; - p = list_first_entry(list, struct publication, local_publ); - list_move_tail(&p->local_publ, &sr->local_publ); - } else { - list = &sr->all_publ; - p = list_first_entry(list, struct publication, all_publ); - list_move_tail(&p->all_publ, &sr->all_publ); + service_range_foreach_match(sr, sc, instance, instance) { + /* Select lookup algo: local, closest-first or round-robin */ + if (*dnode == self) { + list = &sr->local_publ; + if (list_empty(list)) + continue; + p = list_first_entry(list, struct publication, + local_publ); + list_move_tail(&p->local_publ, &sr->local_publ); + } else if (legacy && !*dnode && !list_empty(&sr->local_publ)) { + list = &sr->local_publ; + p = list_first_entry(list, struct publication, + local_publ); + list_move_tail(&p->local_publ, &sr->local_publ); + } else { + list = &sr->all_publ; + p = list_first_entry(list, struct publication, + all_publ); + list_move_tail(&p->all_publ, &sr->all_publ); + } + port = p->port; + node = p->node; + /* Todo: as for legacy, pick the first matching range only, a + * "true" round-robin will be performed as needed. 
+ */ + break; } - port = p->port; - node = p->node; -no_match: spin_unlock_bh(&sc->lock); -not_found: + +exit: rcu_read_unlock(); *dnode = node; return port; @@ -519,7 +611,8 @@ bool tipc_nametbl_lookup(struct net *net, u32 type, u32 instance, u32 scope, spin_lock_bh(&sc->lock); - sr = tipc_service_first_range(sc, instance); + /* Todo: a full search i.e. service_range_foreach_match() instead? */ + sr = service_range_match_first(sc->ranges.rb_node, instance, instance); if (!sr) goto no_match; @@ -548,7 +641,6 @@ void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, struct service_range *sr; struct tipc_service *sc; struct publication *p; - struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, type); @@ -556,13 +648,7 @@ void tipc_nametbl_mc_lookup(struct net *net, u32 type, u32 lower, u32 upper, goto exit; spin_lock_bh(&sc->lock); - - for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->upper < lower) - continue; - if (sr->lower > upper) - break; + service_range_foreach_match(sr, sc, lower, upper) { list_for_each_entry(p, &sr->local_publ, local_publ) { if (p->scope == scope || (!exact && p->scope < scope)) tipc_dest_push(dports, 0, p->port); @@ -583,7 +669,6 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, struct service_range *sr; struct tipc_service *sc; struct publication *p; - struct rb_node *n; rcu_read_lock(); sc = tipc_service_find(net, type); @@ -591,13 +676,7 @@ void tipc_nametbl_lookup_dst_nodes(struct net *net, u32 type, u32 lower, goto exit; spin_lock_bh(&sc->lock); - - for (n = rb_first(&sc->ranges); n; n = rb_next(n)) { - sr = container_of(n, struct service_range, tree_node); - if (sr->upper < lower) - continue; - if (sr->lower > upper) - break; + service_range_foreach_match(sr, sc, lower, upper) { list_for_each_entry(p, &sr->all_publ, all_publ) { tipc_nlist_add(nodes, p->node); } @@ -795,7 +874,7 @@ static void tipc_service_delete(struct net *net, struct tipc_service *sc) tipc_service_remove_publ(sr, p->node, p->key); kfree_rcu(p, rcu); } - rb_erase(&sr->tree_node, &sc->ranges); + rb_erase_augmented(&sr->tree_node, &sc->ranges, &sr_callbacks); kfree(sr); } hlist_del_init_rcu(&sc->service_list); -- 2.13.7 |
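The essence of the fix is classic interval-tree searching on an augmented rbtree: because every node caches 'max', the largest 'upper' in its subtree, whole subtrees whose max is below the searched range can be pruned. The toy program below mirrors the pruning rules of service_range_match_first() on the two-node tree from case #2(a); the node layout and names are invented for illustration, and the tree is hand-built rather than a real rbtree.

#include <stdio.h>

struct range {
        unsigned int lower, upper;
        unsigned int max;               /* largest 'upper' in this subtree */
        struct range *left, *right;
};

static int overlaps(const struct range *r, unsigned int start, unsigned int end)
{
        return r->lower <= end && r->upper >= start;
}

/* Leftmost node overlapping [start, end]; mirrors the pruning rules of
 * service_range_match_first() in the patch.
 */
static struct range *match_first(struct range *n, unsigned int start,
                                 unsigned int end)
{
        if (!n || n->max < start)
                return NULL;
        while (n) {
                if (n->left && n->left->max >= start) {
                        n = n->left;    /* a leftmost match, if any, is here */
                        continue;
                }
                if (overlaps(n, start, end))
                        return n;
                if (n->lower <= end && n->right && n->right->max >= start) {
                        n = n->right;   /* only the right subtree remains */
                        continue;
                }
                break;
        }
        return NULL;
}

int main(void)
{
        /* The tree from case #2(a): root {20,60} with left child {10,50} */
        struct range lo   = { .lower = 10, .upper = 50, .max = 50 };
        struct range root = { .lower = 20, .upper = 60, .max = 60, .left = &lo };
        struct range *sr  = match_first(&root, 30, 30);

        if (sr)
                printf("instance 30 -> range {%u,%u}\n", sr->lower, sr->upper);
        return 0;
}

For instance 30 the lookup descends into the left subtree (its max, 50, is >= 30) and returns {10, 50}, the leftmost overlapping range; the same walk can then be continued with a match_next-style step to visit {20, 60} as well, which is what the patch exploits for multicast lookups.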