From: Ying X. <yin...@wi...> - 2017-02-20 11:40:10
|
Commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid delete") accidently introduce the following deadlock scenarios: CPU1: CPU2: ---------- ---------------- tipc_nametbl_publish spin_lock_bh(&tn->nametbl_lock) tipc_nametbl_insert_publ tipc_nameseq_insert_publ tipc_subscrp_report_overlap tipc_subscrp_get tipc_subscrp_send_event tipc_close_conn tipc_subscrb_release_cb tipc_subscrb_delete tipc_subscrp_put tipc_subscrp_put tipc_subscrp_kref_release tipc_nametbl_unsubscribe spin_lock_bh(&tn->nametbl_lock) <<grab nametbl_lock again>> CPU1: CPU2: ---------- ---------------- tipc_nametbl_stop spin_lock_bh(&tn->nametbl_lock) tipc_purge_publications tipc_nameseq_remove_publ tipc_subscrp_report_overlap tipc_subscrp_get tipc_subscrp_send_event tipc_close_conn tipc_subscrb_release_cb tipc_subscrb_delete tipc_subscrp_put tipc_subscrp_put tipc_subscrp_kref_release tipc_nametbl_unsubscribe spin_lock_bh(&tn->nametbl_lock) <<grab nametbl_lock again>> The root cause of two deadlocks is that we have to hold nametbl lock when subscription is freed in tipc_subscrp_kref_release(). In order to eliminate the need of taking nametbl lock in tipc_subscrp_kref_release(), the functions protected by nametbl lock in tipc_subscrp_kref_release() are moved to other places step by step in the series. Ying Xue (5): tipc: advance the time of deleting subscription from subscriber->subscrp_list tipc: adjust the policy of holding subscription kref tipc: adjust policy that sub->timer holds subscription kref tipc: advance the time of calling tipc_nametbl_unsubscribe tipc: remove unnecessary increasement of subscription refcount net/tipc/name_table.c | 2 ++ net/tipc/subscr.c | 32 ++++++++++++++------------------ net/tipc/subscr.h | 3 +++ 3 files changed, 19 insertions(+), 18 deletions(-) -- 2.7.4 |
From: Ying X. <yin...@wi...> - 2017-02-20 11:39:47
|
To make it more conventional to hold subscription refcount in timer, its policy is adjusted as follows: Before sub->timer is started, subscription refcount is held; when sub->timer is expired, subscription refcount will be decreased at the end of the timer timeout function; when the timer is stopped, refcount also needs to be decreased if the timer is still active. Signed-off-by: Ying Xue <yin...@wi...> --- net/tipc/subscr.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 111d33c6..ffd7b9d 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -241,8 +241,10 @@ static void tipc_subscrp_delete(struct tipc_subscription *sub) { u32 timeout = htohl(sub->evt.s.timeout, sub->swap); - if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) + if (timeout != TIPC_WAIT_FOREVER && del_timer(&sub->timer)) tipc_subscrp_put(sub); + + tipc_subscrp_put(sub); } static void tipc_subscrp_cancel(struct tipc_subscr *s, @@ -303,16 +305,18 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, spin_lock_bh(&subscriber->lock); list_add(&sub->subscrp_list, &subscriber->subscrp_list); + tipc_subscrb_get(subscriber); sub->subscriber = subscriber; + tipc_nametbl_subscribe(sub); - tipc_subscrb_get(subscriber); spin_unlock_bh(&subscriber->lock); setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); timeout = htohl(sub->evt.s.timeout, swap); - if (timeout != TIPC_WAIT_FOREVER) - mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); + if ((timeout != TIPC_WAIT_FOREVER) && + !mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout))) + tipc_subscrp_get(sub); } /* Handle one termination request for the subscriber */ -- 2.7.4 |
From: Ying X. <yin...@wi...> - 2017-02-20 11:39:47
|
When subscription is inserted into subscriber->subscrp_list, the subscription is also inserted to name sequence subscriptions list. Now when subscription is deleted from subscriber->subscrp_list in tipc_subscrb_subscrp_delete(), it's also a proper time to delete the subscription from name sequence subscriptions list too. Signed-off-by: Ying Xue <yin...@wi...> --- net/tipc/subscr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index ffd7b9d..6624232 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -169,7 +169,6 @@ static void tipc_subscrp_kref_release(struct kref *kref) struct tipc_subscriber *subscriber = sub->subscriber; spin_lock_bh(&subscriber->lock); - tipc_nametbl_unsubscribe(sub); atomic_dec(&tn->subscription_count); spin_unlock_bh(&subscriber->lock); kfree(sub); @@ -202,6 +201,7 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, list_del(&sub->subscrp_list); + tipc_nametbl_unsubscribe(sub); tipc_subscrp_get(sub); spin_unlock_bh(&subscriber->lock); tipc_subscrp_delete(sub); -- 2.7.4 |
From: Ying X. <yin...@wi...> - 2017-02-20 11:39:48
|
After a subscription object is created, it's inserted into its subscriber subscrp_list list under subscriber lock protection, similarly, before it's destroyed, it should be first removed from its subscriber->subscrp_list. Signed-off-by: Ying Xue <yin...@wi...> --- net/tipc/subscr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 9d94e65..e70e7ba 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -174,7 +174,6 @@ static void tipc_subscrp_kref_release(struct kref *kref) spin_lock_bh(&subscriber->lock); tipc_nametbl_unsubscribe(sub); - list_del(&sub->subscrp_list); atomic_dec(&tn->subscription_count); spin_unlock_bh(&subscriber->lock); kfree(sub); @@ -205,6 +204,8 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, if (s && memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) continue; + list_del(&sub->subscrp_list); + tipc_subscrp_get(sub); spin_unlock_bh(&subscriber->lock); tipc_subscrp_delete(sub); @@ -305,6 +306,7 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, spin_lock_bh(&subscriber->lock); list_add(&sub->subscrp_list, &subscriber->subscrp_list); + sub->subscriber = subscriber; tipc_nametbl_subscribe(sub); tipc_subscrb_get(subscriber); -- 2.7.4 |
From: Ying X. <yin...@wi...> - 2017-02-20 11:40:14
|
When a new subscription object is inserted into name_seq->subscriptions list, it's under name_seq->lock protection; when a subscription is deleted from the list, it's also under the same lock protection; similarly, when accessing a subscription by going through subscriptions list, the entire process is also protected by the name_seq->lock. Therefore, if subscription refcount is increased before it's inserted into subscriptions list, and its refcount is decreased after it's deleted from the list, it will be unnecessary to hold refcount at all before accessing subscription object which is obtained by going through subscriptions list under name_seq->lock protection. Signed-off-by: Ying Xue <yin...@wi...> --- net/tipc/name_table.c | 2 ++ net/tipc/subscr.c | 8 ++------ net/tipc/subscr.h | 3 +++ 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index e190460..ca3c69e 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -416,6 +416,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); + tipc_subscrp_get(s); list_add(&s->nameseq_list, &nseq->subscriptions); if (!sseq) @@ -754,6 +755,7 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s) if (seq != NULL) { spin_lock_bh(&seq->lock); list_del_init(&s->nameseq_list); + tipc_subscrp_put(s); if (!seq->first_free && list_empty(&seq->subscriptions)) { hlist_del_init_rcu(&seq->ns_list); kfree(seq->sseqs); diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index e70e7ba..111d33c6 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -54,8 +54,6 @@ struct tipc_subscriber { static void tipc_subscrp_delete(struct tipc_subscription *sub); static void tipc_subscrb_put(struct tipc_subscriber *subscriber); -static void tipc_subscrp_put(struct tipc_subscription *subscription); -static void tipc_subscrp_get(struct tipc_subscription *subscription); /** * htohl - convert value to endianness used by destination @@ -125,7 +123,6 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, { struct tipc_name_seq seq; - tipc_subscrp_get(sub); tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; @@ -135,7 +132,6 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, node); - tipc_subscrp_put(sub); } static void tipc_subscrp_timeout(unsigned long data) @@ -180,12 +176,12 @@ static void tipc_subscrp_kref_release(struct kref *kref) tipc_subscrb_put(subscriber); } -static void tipc_subscrp_put(struct tipc_subscription *subscription) +void tipc_subscrp_put(struct tipc_subscription *subscription) { kref_put(&subscription->kref, tipc_subscrp_kref_release); } -static void tipc_subscrp_get(struct tipc_subscription *subscription) +void tipc_subscrp_get(struct tipc_subscription *subscription) { kref_get(&subscription->kref); } diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index ffdc214..ee52957 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -78,4 +78,7 @@ u32 tipc_subscrp_convert_seq_type(u32 type, int swap); int tipc_topsrv_start(struct net *net); void tipc_topsrv_stop(struct net *net); +void tipc_subscrp_put(struct tipc_subscription *subscription); +void tipc_subscrp_get(struct tipc_subscription *subscription); + #endif -- 2.7.4 |
From: Ying X. <yin...@wi...> - 2017-02-20 11:40:19
|
As the policy of holding subscription in subscriber time has been adjusted, it's unnecessary to hold subscription refcount before tipc_subscrp_delete() is called. As a consequence, subscriber->lock can be safely removed to avoid the following two deadlock scenarios: CPU1: CPU2: ---------- ---------------- tipc_nametbl_publish spin_lock_bh(&tn->nametbl_lock) tipc_nametbl_insert_publ tipc_nameseq_insert_publ tipc_subscrp_report_overlap tipc_subscrp_get tipc_subscrp_send_event tipc_close_conn tipc_subscrb_release_cb tipc_subscrb_delete tipc_subscrp_put tipc_subscrp_put tipc_subscrp_kref_release tipc_nametbl_unsubscribe spin_lock_bh(&tn->nametbl_lock) <<grab nametbl_lock again>> CPU1: CPU2: ---------- ---------------- tipc_nametbl_stop spin_lock_bh(&tn->nametbl_lock) tipc_purge_publications tipc_nameseq_remove_publ tipc_subscrp_report_overlap tipc_subscrp_get tipc_subscrp_send_event tipc_close_conn tipc_subscrb_release_cb tipc_subscrb_delete tipc_subscrp_put tipc_subscrp_put tipc_subscrp_kref_release tipc_nametbl_unsubscribe spin_lock_bh(&tn->nametbl_lock) <<grab nametbl_lock again>> Reported-by: John Thompson <tho...@gm...> Reported-by: Jon Maloy <jon...@er...> Fixes: d094c4d5f5 ("tipc: add subscription refcount to avoid invalid delete") Signed-off-by: Ying Xue <yin...@wi...> --- net/tipc/subscr.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 6624232..ea67cce 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -168,9 +168,7 @@ static void tipc_subscrp_kref_release(struct kref *kref) struct tipc_net *tn = net_generic(sub->net, tipc_net_id); struct tipc_subscriber *subscriber = sub->subscriber; - spin_lock_bh(&subscriber->lock); atomic_dec(&tn->subscription_count); - spin_unlock_bh(&subscriber->lock); kfree(sub); tipc_subscrb_put(subscriber); } @@ -202,11 +200,7 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, list_del(&sub->subscrp_list); tipc_nametbl_unsubscribe(sub); - tipc_subscrp_get(sub); - spin_unlock_bh(&subscriber->lock); tipc_subscrp_delete(sub); - tipc_subscrp_put(sub); - spin_lock_bh(&subscriber->lock); if (s) break; -- 2.7.4 |
From: John T. <tho...@gm...> - 2017-02-20 22:13:49
|
On Tue, Feb 21, 2017 at 12:39 AM, Ying Xue <yin...@wi...> wrote: > As the policy of holding subscription in subscriber time has been > adjusted, it's unnecessary to hold subscription refcount before > tipc_subscrp_delete() is called. As a consequence, subscriber->lock > can be safely removed to avoid the following two deadlock scenarios: > > CPU1: CPU2: > ---------- ---------------- > tipc_nametbl_publish > spin_lock_bh(&tn->nametbl_lock) > tipc_nametbl_insert_publ > tipc_nameseq_insert_publ > tipc_subscrp_report_overlap > tipc_subscrp_get > tipc_subscrp_send_event > tipc_close_conn > tipc_subscrb_release_cb > tipc_subscrb_delete > tipc_subscrp_put > tipc_subscrp_put > tipc_subscrp_kref_release > tipc_nametbl_unsubscribe > spin_lock_bh(&tn->nametbl_lock) > <<grab nametbl_lock again>> > > CPU1: CPU2: > ---------- ---------------- > tipc_nametbl_stop > spin_lock_bh(&tn->nametbl_lock) > tipc_purge_publications > tipc_nameseq_remove_publ > tipc_subscrp_report_overlap > tipc_subscrp_get > tipc_subscrp_send_event > tipc_close_conn > tipc_subscrb_release_cb > tipc_subscrb_delete > tipc_subscrp_put > tipc_subscrp_put > tipc_subscrp_kref_release > tipc_nametbl_unsubscribe > spin_lock_bh(&tn->nametbl_lock) > <<grab nametbl_lock again>> > > Reported-by: John Thompson <tho...@gm...> > Reported-by: Jon Maloy <jon...@er...> > Fixes: d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > delete") > Signed-off-by: Ying Xue <yin...@wi...> > --- > net/tipc/subscr.c | 6 ------ > 1 file changed, 6 deletions(-) > > diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c > index 6624232..ea67cce 100644 > --- a/net/tipc/subscr.c > +++ b/net/tipc/subscr.c > @@ -168,9 +168,7 @@ static void tipc_subscrp_kref_release(struct kref > *kref) > struct tipc_net *tn = net_generic(sub->net, tipc_net_id); > struct tipc_subscriber *subscriber = sub->subscriber; > > - spin_lock_bh(&subscriber->lock); > atomic_dec(&tn->subscription_count); > - spin_unlock_bh(&subscriber->lock); > kfree(sub); > tipc_subscrb_put(subscriber); > } > @@ -202,11 +200,7 @@ static void tipc_subscrb_subscrp_delete(struct > tipc_subscriber *subscriber, > list_del(&sub->subscrp_list); > > tipc_nametbl_unsubscribe(sub); > - tipc_subscrp_get(sub); > - spin_unlock_bh(&subscriber->lock); > By removing the unlocking at this point couldn't you end up with a deadlock on subscriber->lock due to tipc_subscrp_delete() putting the subscrp twice (which could end up with kref == 0) and as a result calling tipc_subscrp_kref_release() which gets subscriber->lock? > tipc_subscrp_delete(sub); > - tipc_subscrp_put(sub); > - spin_lock_bh(&subscriber->lock); > > if (s) > break; > -- > 2.7.4 > > |
From: John T. <tho...@gm...> - 2017-02-20 22:30:22
|
Hi Ying, Sorry I sent that reply before signing it off. The rest of the changes look like they are doing the right things but this change looks like it has the potential to cause a deadlock. Regards, John On Tue, Feb 21, 2017 at 11:13 AM, John Thompson <tho...@gm...> wrote: > > > On Tue, Feb 21, 2017 at 12:39 AM, Ying Xue <yin...@wi...> wrote: > >> As the policy of holding subscription in subscriber time has been >> adjusted, it's unnecessary to hold subscription refcount before >> tipc_subscrp_delete() is called. As a consequence, subscriber->lock >> can be safely removed to avoid the following two deadlock scenarios: >> >> CPU1: CPU2: >> ---------- ---------------- >> tipc_nametbl_publish >> spin_lock_bh(&tn->nametbl_lock) >> tipc_nametbl_insert_publ >> tipc_nameseq_insert_publ >> tipc_subscrp_report_overlap >> tipc_subscrp_get >> tipc_subscrp_send_event >> tipc_close_conn >> tipc_subscrb_release_cb >> tipc_subscrb_delete >> tipc_subscrp_put >> tipc_subscrp_put >> tipc_subscrp_kref_release >> tipc_nametbl_unsubscribe >> spin_lock_bh(&tn->nametbl_lock) >> <<grab nametbl_lock again>> >> >> CPU1: CPU2: >> ---------- ---------------- >> tipc_nametbl_stop >> spin_lock_bh(&tn->nametbl_lock) >> tipc_purge_publications >> tipc_nameseq_remove_publ >> tipc_subscrp_report_overlap >> tipc_subscrp_get >> tipc_subscrp_send_event >> tipc_close_conn >> tipc_subscrb_release_cb >> tipc_subscrb_delete >> tipc_subscrp_put >> tipc_subscrp_put >> tipc_subscrp_kref_release >> tipc_nametbl_unsubscribe >> spin_lock_bh(&tn->nametbl_lock) >> <<grab nametbl_lock again>> >> >> Reported-by: John Thompson <tho...@gm...> >> Reported-by: Jon Maloy <jon...@er...> >> Fixes: d094c4d5f5 ("tipc: add subscription refcount to avoid invalid >> delete") >> Signed-off-by: Ying Xue <yin...@wi...> >> --- >> net/tipc/subscr.c | 6 ------ >> 1 file changed, 6 deletions(-) >> >> diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c >> index 6624232..ea67cce 100644 >> --- a/net/tipc/subscr.c >> +++ b/net/tipc/subscr.c >> @@ -168,9 +168,7 @@ static void tipc_subscrp_kref_release(struct kref >> *kref) >> struct tipc_net *tn = net_generic(sub->net, tipc_net_id); >> struct tipc_subscriber *subscriber = sub->subscriber; >> >> - spin_lock_bh(&subscriber->lock); >> atomic_dec(&tn->subscription_count); >> - spin_unlock_bh(&subscriber->lock); >> kfree(sub); >> tipc_subscrb_put(subscriber); >> } >> @@ -202,11 +200,7 @@ static void tipc_subscrb_subscrp_delete(struct >> tipc_subscriber *subscriber, >> list_del(&sub->subscrp_list); >> >> tipc_nametbl_unsubscribe(sub); >> - tipc_subscrp_get(sub); >> - spin_unlock_bh(&subscriber->lock); >> > > By removing the unlocking at this point couldn't you end up with a > deadlock on subscriber->lock > due to tipc_subscrp_delete() putting the subscrp twice (which could end up > with kref == 0) and > as a result calling tipc_subscrp_kref_release() which gets > subscriber->lock? > > > >> tipc_subscrp_delete(sub); >> - tipc_subscrp_put(sub); >> - spin_lock_bh(&subscriber->lock); >> >> if (s) >> break; >> -- >> 2.7.4 >> >> > |
From: Ying X. <yin...@wi...> - 2017-02-21 10:22:59
|
On 02/21/2017 06:13 AM, John Thompson wrote: > > > On Tue, Feb 21, 2017 at 12:39 AM, Ying Xue <yin...@wi... > <mailto:yin...@wi...>> wrote: > > As the policy of holding subscription in subscriber time has been > adjusted, it's unnecessary to hold subscription refcount before > tipc_subscrp_delete() is called. As a consequence, subscriber->lock > can be safely removed to avoid the following two deadlock scenarios: > > CPU1: CPU2: > ---------- ---------------- > tipc_nametbl_publish > spin_lock_bh(&tn->nametbl_lock) > tipc_nametbl_insert_publ > tipc_nameseq_insert_publ > tipc_subscrp_report_overlap > tipc_subscrp_get > tipc_subscrp_send_event > tipc_close_conn > tipc_subscrb_release_cb > tipc_subscrb_delete > tipc_subscrp_put > tipc_subscrp_put > tipc_subscrp_kref_release > tipc_nametbl_unsubscribe > spin_lock_bh(&tn->nametbl_lock) > <<grab nametbl_lock again>> > > CPU1: CPU2: > ---------- ---------------- > tipc_nametbl_stop > spin_lock_bh(&tn->nametbl_lock) > tipc_purge_publications > tipc_nameseq_remove_publ > tipc_subscrp_report_overlap > tipc_subscrp_get > tipc_subscrp_send_event > tipc_close_conn > tipc_subscrb_release_cb > tipc_subscrb_delete > tipc_subscrp_put > tipc_subscrp_put > tipc_subscrp_kref_release > tipc_nametbl_unsubscribe > spin_lock_bh(&tn->nametbl_lock) > <<grab nametbl_lock again>> > > Reported-by: John Thompson <tho...@gm... > <mailto:tho...@gm...>> > Reported-by: Jon Maloy <jon...@er... > <mailto:jon...@er...>> > Fixes: d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > delete") > Signed-off-by: Ying Xue <yin...@wi... > <mailto:yin...@wi...>> > --- > net/tipc/subscr.c | 6 ------ > 1 file changed, 6 deletions(-) > > diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c > index 6624232..ea67cce 100644 > --- a/net/tipc/subscr.c > +++ b/net/tipc/subscr.c > @@ -168,9 +168,7 @@ static void tipc_subscrp_kref_release(struct > kref *kref) > struct tipc_net *tn = net_generic(sub->net, tipc_net_id); > struct tipc_subscriber *subscriber = sub->subscriber; > > - spin_lock_bh(&subscriber->lock); > atomic_dec(&tn->subscription_count); > - spin_unlock_bh(&subscriber->lock); > kfree(sub); > tipc_subscrb_put(subscriber); > } > @@ -202,11 +200,7 @@ static void tipc_subscrb_subscrp_delete(struct > tipc_subscriber *subscriber, > list_del(&sub->subscrp_list); > > tipc_nametbl_unsubscribe(sub); > - tipc_subscrp_get(sub); > - spin_unlock_bh(&subscriber->lock); > > > By removing the unlocking at this point couldn't you end up with a > deadlock on subscriber->lock > due to tipc_subscrp_delete() putting the subscrp twice (which could end > up with kref == 0) and > as a result calling tipc_subscrp_kref_release() which gets subscriber->lock? > I think there is no deadlock risk even if tipc_subscrp_put() will be called twice in tipc_subscrp_delete() because tipc_subscrp_kref_release() doesn't hold subscriber->lock after the change. Please take a look at tipc_subscrp_kref_release() code: static void tipc_subscrp_kref_release(struct kref *kref) { struct tipc_subscription *sub = container_of(kref, struct tipc_subscription, kref); struct tipc_net *tn = net_generic(sub->net, tipc_net_id); struct tipc_subscriber *subscriber = sub->subscriber; atomic_dec(&tn->subscription_count); kfree(sub); tipc_subscrb_put(subscriber); } > > > tipc_subscrp_delete(sub); > - tipc_subscrp_put(sub); > - spin_lock_bh(&subscriber->lock); > > if (s) > break; > -- > 2.7.4 > > |
From: Jon M. <jon...@er...> - 2017-02-21 11:11:42
|
Hi Ying, These are good design changes, that definitely should go in asap. However, I feel deeply uncomfortable with such a big change going into 'net', especially since our previous, exceptionally large, contribution now has turned out to have problems. I wonder if we could not get away with something simpler for 'net'. Looking closer at your series, it seems to me that only patches ## 1, 4, and the lock removal part of #5 are really needed to solve the problem we have at hand now. Why not merge those into one patch and deliver this to 'net', while reference count redesign parts can go into net-next ? Regards ///jon > -----Original Message----- > From: Ying Xue [mailto:yin...@wi...] > Sent: Monday, February 20, 2017 06:39 AM > To: Jon Maloy <jon...@er...>; Parthasarathy Bhuvaragan > <par...@er...>; tho...@gm... > Cc: tip...@li... > Subject: [net 0/5] solve two deadlock issues > > Commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > delete") accidently introduce the following deadlock scenarios: > > CPU1: CPU2: > ---------- ---------------- > tipc_nametbl_publish > spin_lock_bh(&tn->nametbl_lock) > tipc_nametbl_insert_publ > tipc_nameseq_insert_publ > tipc_subscrp_report_overlap > tipc_subscrp_get > tipc_subscrp_send_event > tipc_close_conn > tipc_subscrb_release_cb > tipc_subscrb_delete > tipc_subscrp_put > tipc_subscrp_put > tipc_subscrp_kref_release > tipc_nametbl_unsubscribe > spin_lock_bh(&tn->nametbl_lock) > <<grab nametbl_lock again>> > > CPU1: CPU2: > ---------- ---------------- > tipc_nametbl_stop > spin_lock_bh(&tn->nametbl_lock) > tipc_purge_publications > tipc_nameseq_remove_publ > tipc_subscrp_report_overlap > tipc_subscrp_get > tipc_subscrp_send_event > tipc_close_conn > tipc_subscrb_release_cb > tipc_subscrb_delete > tipc_subscrp_put > tipc_subscrp_put > tipc_subscrp_kref_release > tipc_nametbl_unsubscribe > spin_lock_bh(&tn->nametbl_lock) > <<grab nametbl_lock again>> > > The root cause of two deadlocks is that we have to hold nametbl lock when > subscription is freed in tipc_subscrp_kref_release(). In order to eliminate the > need of taking nametbl lock in tipc_subscrp_kref_release(), the functions > protected by nametbl lock in tipc_subscrp_kref_release() are moved to > other places step by step in the series. > > Ying Xue (5): > tipc: advance the time of deleting subscription from > subscriber->subscrp_list > tipc: adjust the policy of holding subscription kref > tipc: adjust policy that sub->timer holds subscription kref > tipc: advance the time of calling tipc_nametbl_unsubscribe > tipc: remove unnecessary increasement of subscription refcount > > net/tipc/name_table.c | 2 ++ > net/tipc/subscr.c | 32 ++++++++++++++------------------ > net/tipc/subscr.h | 3 +++ > 3 files changed, 19 insertions(+), 18 deletions(-) > > -- > 2.7.4 |
From: Jon M. <jon...@er...> - 2017-02-21 12:42:51
|
I don't see that you remove the two premature 'return's in subcsrb_report_overlap() in your series. These are also genuine bugs that must be fixed. ///jon > -----Original Message----- > From: Jon Maloy [mailto:jon...@er...] > Sent: Tuesday, February 21, 2017 06:12 AM > To: Ying Xue <yin...@wi...>; Parthasarathy Bhuvaragan > <par...@er...>; tho...@gm... > Cc: tip...@li... > Subject: Re: [tipc-discussion] [net 0/5] solve two deadlock issues > > Hi Ying, > These are good design changes, that definitely should go in asap. However, I > feel deeply uncomfortable with such a big change going into 'net', especially > since our previous, exceptionally large, contribution now has turned out to > have problems. I wonder if we could not get away with something simpler > for 'net'. > > Looking closer at your series, it seems to me that only patches ## 1, 4, and > the lock removal part of #5 are really needed to solve the problem we have > at hand now. Why not merge those into one patch and deliver this to 'net', > while reference count redesign parts can go into net-next ? > > Regards > ///jon > > > > -----Original Message----- > > From: Ying Xue [mailto:yin...@wi...] > > Sent: Monday, February 20, 2017 06:39 AM > > To: Jon Maloy <jon...@er...>; Parthasarathy Bhuvaragan > > <par...@er...>; tho...@gm... > > Cc: tip...@li... > > Subject: [net 0/5] solve two deadlock issues > > > > Commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > > delete") accidently introduce the following deadlock scenarios: > > > > CPU1: CPU2: > > ---------- ---------------- > > tipc_nametbl_publish > > spin_lock_bh(&tn->nametbl_lock) > > tipc_nametbl_insert_publ > > tipc_nameseq_insert_publ > > tipc_subscrp_report_overlap > > tipc_subscrp_get > > tipc_subscrp_send_event > > tipc_close_conn > > tipc_subscrb_release_cb > > tipc_subscrb_delete > > tipc_subscrp_put > > tipc_subscrp_put > > tipc_subscrp_kref_release > > tipc_nametbl_unsubscribe > > spin_lock_bh(&tn->nametbl_lock) > > <<grab nametbl_lock again>> > > > > CPU1: CPU2: > > ---------- ---------------- > > tipc_nametbl_stop > > spin_lock_bh(&tn->nametbl_lock) > > tipc_purge_publications > > tipc_nameseq_remove_publ > > tipc_subscrp_report_overlap > > tipc_subscrp_get > > tipc_subscrp_send_event > > tipc_close_conn > > tipc_subscrb_release_cb > > tipc_subscrb_delete > > tipc_subscrp_put > > tipc_subscrp_put > > tipc_subscrp_kref_release > > tipc_nametbl_unsubscribe > > spin_lock_bh(&tn->nametbl_lock) > > <<grab nametbl_lock again>> > > > > The root cause of two deadlocks is that we have to hold nametbl lock > > when subscription is freed in tipc_subscrp_kref_release(). In order to > > eliminate the need of taking nametbl lock in > > tipc_subscrp_kref_release(), the functions protected by nametbl lock > > in tipc_subscrp_kref_release() are moved to other places step by step in > the series. > > > > Ying Xue (5): > > tipc: advance the time of deleting subscription from > > subscriber->subscrp_list > > tipc: adjust the policy of holding subscription kref > > tipc: adjust policy that sub->timer holds subscription kref > > tipc: advance the time of calling tipc_nametbl_unsubscribe > > tipc: remove unnecessary increasement of subscription refcount > > > > net/tipc/name_table.c | 2 ++ > > net/tipc/subscr.c | 32 ++++++++++++++------------------ > > net/tipc/subscr.h | 3 +++ > > 3 files changed, 19 insertions(+), 18 deletions(-) > > > > -- > > 2.7.4 > > > ------------------------------------------------------------------------------ > Check out the vibrant tech community on one of the world's most engaging > tech sites, SlashDot.org! http://sdm.link/slashdot > _______________________________________________ > tipc-discussion mailing list > tip...@li... > https://lists.sourceforge.net/lists/listinfo/tipc-discussion |
From: John T. <tho...@gm...> - 2017-02-21 21:15:47
|
Sorry, I was mistaken. You are right that you have removed the sub->lock from tipc_subscrp_kref_release() and so there is no chance of a deadlock. JT On Tue, Feb 21, 2017 at 11:22 PM, Ying Xue <yin...@wi...> wrote: > On 02/21/2017 06:13 AM, John Thompson wrote: > > > > > > On Tue, Feb 21, 2017 at 12:39 AM, Ying Xue <yin...@wi... > > <mailto:yin...@wi...>> wrote: > > > > As the policy of holding subscription in subscriber time has been > > adjusted, it's unnecessary to hold subscription refcount before > > tipc_subscrp_delete() is called. As a consequence, subscriber->lock > > can be safely removed to avoid the following two deadlock scenarios: > > > > CPU1: CPU2: > > ---------- ---------------- > > tipc_nametbl_publish > > spin_lock_bh(&tn->nametbl_lock) > > tipc_nametbl_insert_publ > > tipc_nameseq_insert_publ > > tipc_subscrp_report_overlap > > tipc_subscrp_get > > tipc_subscrp_send_event > > tipc_close_conn > > tipc_subscrb_release_cb > > tipc_subscrb_delete > > tipc_subscrp_put > > tipc_subscrp_put > > tipc_subscrp_kref_release > > tipc_nametbl_unsubscribe > > spin_lock_bh(&tn->nametbl_lock) > > <<grab nametbl_lock again>> > > > > CPU1: CPU2: > > ---------- ---------------- > > tipc_nametbl_stop > > spin_lock_bh(&tn->nametbl_lock) > > tipc_purge_publications > > tipc_nameseq_remove_publ > > tipc_subscrp_report_overlap > > tipc_subscrp_get > > tipc_subscrp_send_event > > tipc_close_conn > > tipc_subscrb_release_cb > > tipc_subscrb_delete > > tipc_subscrp_put > > tipc_subscrp_put > > tipc_subscrp_kref_release > > tipc_nametbl_unsubscribe > > spin_lock_bh(&tn->nametbl_lock) > > <<grab nametbl_lock again>> > > > > Reported-by: John Thompson <tho...@gm... > > <mailto:tho...@gm...>> > > Reported-by: Jon Maloy <jon...@er... > > <mailto:jon...@er...>> > > Fixes: d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > > delete") > > Signed-off-by: Ying Xue <yin...@wi... > > <mailto:yin...@wi...>> > > --- > > net/tipc/subscr.c | 6 ------ > > 1 file changed, 6 deletions(-) > > > > diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c > > index 6624232..ea67cce 100644 > > --- a/net/tipc/subscr.c > > +++ b/net/tipc/subscr.c > > @@ -168,9 +168,7 @@ static void tipc_subscrp_kref_release(struct > > kref *kref) > > struct tipc_net *tn = net_generic(sub->net, tipc_net_id); > > struct tipc_subscriber *subscriber = sub->subscriber; > > > > - spin_lock_bh(&subscriber->lock); > > atomic_dec(&tn->subscription_count); > > - spin_unlock_bh(&subscriber->lock); > > kfree(sub); > > tipc_subscrb_put(subscriber); > > } > > @@ -202,11 +200,7 @@ static void tipc_subscrb_subscrp_delete(struct > > tipc_subscriber *subscriber, > > list_del(&sub->subscrp_list); > > > > tipc_nametbl_unsubscribe(sub); > > - tipc_subscrp_get(sub); > > - spin_unlock_bh(&subscriber->lock); > > > > > > By removing the unlocking at this point couldn't you end up with a > > deadlock on subscriber->lock > > due to tipc_subscrp_delete() putting the subscrp twice (which could end > > up with kref == 0) and > > as a result calling tipc_subscrp_kref_release() which gets > subscriber->lock? > > > > I think there is no deadlock risk even if tipc_subscrp_put() will be > called twice in tipc_subscrp_delete() because > tipc_subscrp_kref_release() doesn't hold subscriber->lock after the change. > > Please take a look at tipc_subscrp_kref_release() code: > static void tipc_subscrp_kref_release(struct kref *kref) > { > struct tipc_subscription *sub = container_of(kref, > struct > tipc_subscription, > kref); > struct tipc_net *tn = net_generic(sub->net, tipc_net_id); > struct tipc_subscriber *subscriber = sub->subscriber; > > atomic_dec(&tn->subscription_count); > kfree(sub); > tipc_subscrb_put(subscriber); > } > > > > > > > tipc_subscrp_delete(sub); > > - tipc_subscrp_put(sub); > > - spin_lock_bh(&subscriber->lock); > > > > if (s) > > break; > > -- > > 2.7.4 > > > > > |
From: John T. <tho...@gm...> - 2017-02-21 21:18:07
|
Patch #2 removes the tipc_subscrp_get() and _put() from tipc_subscrp_report_overlap(). This prevents the problem of the early returns. JT On Wed, Feb 22, 2017 at 1:42 AM, Jon Maloy <jon...@er...> wrote: > I don't see that you remove the two premature 'return's in > subcsrb_report_overlap() in your series. These are also genuine bugs that > must be fixed. > > ///jon > > > > -----Original Message----- > > From: Jon Maloy [mailto:jon...@er...] > > Sent: Tuesday, February 21, 2017 06:12 AM > > To: Ying Xue <yin...@wi...>; Parthasarathy Bhuvaragan > > <par...@er...>; tho...@gm... > > Cc: tip...@li... > > Subject: Re: [tipc-discussion] [net 0/5] solve two deadlock issues > > > > Hi Ying, > > These are good design changes, that definitely should go in asap. > However, I > > feel deeply uncomfortable with such a big change going into 'net', > especially > > since our previous, exceptionally large, contribution now has turned out > to > > have problems. I wonder if we could not get away with something simpler > > for 'net'. > > > > Looking closer at your series, it seems to me that only patches ## 1, 4, > and > > the lock removal part of #5 are really needed to solve the problem we > have > > at hand now. Why not merge those into one patch and deliver this to > 'net', > > while reference count redesign parts can go into net-next ? > > > > Regards > > ///jon > > > > > > > -----Original Message----- > > > From: Ying Xue [mailto:yin...@wi...] > > > Sent: Monday, February 20, 2017 06:39 AM > > > To: Jon Maloy <jon...@er...>; Parthasarathy Bhuvaragan > > > <par...@er...>; tho...@gm... > > > Cc: tip...@li... > > > Subject: [net 0/5] solve two deadlock issues > > > > > > Commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > > > delete") accidently introduce the following deadlock scenarios: > > > > > > CPU1: CPU2: > > > ---------- ---------------- > > > tipc_nametbl_publish > > > spin_lock_bh(&tn->nametbl_lock) > > > tipc_nametbl_insert_publ > > > tipc_nameseq_insert_publ > > > tipc_subscrp_report_overlap > > > tipc_subscrp_get > > > tipc_subscrp_send_event > > > tipc_close_conn > > > tipc_subscrb_release_cb > > > tipc_subscrb_delete > > > tipc_subscrp_put > > > tipc_subscrp_put > > > tipc_subscrp_kref_release > > > tipc_nametbl_unsubscribe > > > spin_lock_bh(&tn->nametbl_lock) > > > <<grab nametbl_lock again>> > > > > > > CPU1: CPU2: > > > ---------- ---------------- > > > tipc_nametbl_stop > > > spin_lock_bh(&tn->nametbl_lock) > > > tipc_purge_publications > > > tipc_nameseq_remove_publ > > > tipc_subscrp_report_overlap > > > tipc_subscrp_get > > > tipc_subscrp_send_event > > > tipc_close_conn > > > tipc_subscrb_release_cb > > > tipc_subscrb_delete > > > tipc_subscrp_put > > > tipc_subscrp_put > > > tipc_subscrp_kref_release > > > tipc_nametbl_unsubscribe > > > spin_lock_bh(&tn->nametbl_lock) > > > <<grab nametbl_lock again>> > > > > > > The root cause of two deadlocks is that we have to hold nametbl lock > > > when subscription is freed in tipc_subscrp_kref_release(). In order to > > > eliminate the need of taking nametbl lock in > > > tipc_subscrp_kref_release(), the functions protected by nametbl lock > > > in tipc_subscrp_kref_release() are moved to other places step by step > in > > the series. > > > > > > Ying Xue (5): > > > tipc: advance the time of deleting subscription from > > > subscriber->subscrp_list > > > tipc: adjust the policy of holding subscription kref > > > tipc: adjust policy that sub->timer holds subscription kref > > > tipc: advance the time of calling tipc_nametbl_unsubscribe > > > tipc: remove unnecessary increasement of subscription refcount > > > > > > net/tipc/name_table.c | 2 ++ > > > net/tipc/subscr.c | 32 ++++++++++++++------------------ > > > net/tipc/subscr.h | 3 +++ > > > 3 files changed, 19 insertions(+), 18 deletions(-) > > > > > > -- > > > 2.7.4 > > > > > > ------------------------------------------------------------ > ------------------ > > Check out the vibrant tech community on one of the world's most engaging > > tech sites, SlashDot.org! http://sdm.link/slashdot > > _______________________________________________ > > tipc-discussion mailing list > > tip...@li... > > https://lists.sourceforge.net/lists/listinfo/tipc-discussion > |
From: Jon M. <jon...@er...> - 2017-02-21 22:56:52
|
Hi John. Yes you are right. But I still would prefer a condensed patch where we don’t touch the refcounts when this goes into ‘net’. I am awaiting a comment from Ying. ///jon From: John Thompson [mailto:tho...@gm...] Sent: Tuesday, February 21, 2017 04:18 PM To: Jon Maloy <jon...@er...> Cc: Ying Xue <yin...@wi...>; Parthasarathy Bhuvaragan <par...@er...>; tip...@li... Subject: Re: [net 0/5] solve two deadlock issues Patch #2 removes the tipc_subscrp_get() and _put() from tipc_subscrp_report_overlap(). This prevents the problem of the early returns. JT On Wed, Feb 22, 2017 at 1:42 AM, Jon Maloy <jon...@er...<mailto:jon...@er...>> wrote: I don't see that you remove the two premature 'return's in subcsrb_report_overlap() in your series. These are also genuine bugs that must be fixed. ///jon > -----Original Message----- > From: Jon Maloy [mailto:jon...@er...<mailto:jon...@er...>] > Sent: Tuesday, February 21, 2017 06:12 AM > To: Ying Xue <yin...@wi...<mailto:yin...@wi...>>; Parthasarathy Bhuvaragan > <par...@er...<mailto:par...@er...>>; tho...@gm...<mailto:tho...@gm...> > Cc: tip...@li...<mailto:tip...@li...> > Subject: Re: [tipc-discussion] [net 0/5] solve two deadlock issues > > Hi Ying, > These are good design changes, that definitely should go in asap. However, I > feel deeply uncomfortable with such a big change going into 'net', especially > since our previous, exceptionally large, contribution now has turned out to > have problems. I wonder if we could not get away with something simpler > for 'net'. > > Looking closer at your series, it seems to me that only patches ## 1, 4, and > the lock removal part of #5 are really needed to solve the problem we have > at hand now. Why not merge those into one patch and deliver this to 'net', > while reference count redesign parts can go into net-next ? > > Regards > ///jon > > > > -----Original Message----- > > From: Ying Xue [mailto:yin...@wi...<mailto:yin...@wi...>] > > Sent: Monday, February 20, 2017 06:39 AM > > To: Jon Maloy <jon...@er...<mailto:jon...@er...>>; Parthasarathy Bhuvaragan > > <par...@er...<mailto:par...@er...>>; tho...@gm...<mailto:tho...@gm...> > > Cc: tip...@li...<mailto:tip...@li...> > > Subject: [net 0/5] solve two deadlock issues > > > > Commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > > delete") accidently introduce the following deadlock scenarios: > > > > CPU1: CPU2: > > ---------- ---------------- > > tipc_nametbl_publish > > spin_lock_bh(&tn->nametbl_lock) > > tipc_nametbl_insert_publ > > tipc_nameseq_insert_publ > > tipc_subscrp_report_overlap > > tipc_subscrp_get > > tipc_subscrp_send_event > > tipc_close_conn > > tipc_subscrb_release_cb > > tipc_subscrb_delete > > tipc_subscrp_put > > tipc_subscrp_put > > tipc_subscrp_kref_release > > tipc_nametbl_unsubscribe > > spin_lock_bh(&tn->nametbl_lock) > > <<grab nametbl_lock again>> > > > > CPU1: CPU2: > > ---------- ---------------- > > tipc_nametbl_stop > > spin_lock_bh(&tn->nametbl_lock) > > tipc_purge_publications > > tipc_nameseq_remove_publ > > tipc_subscrp_report_overlap > > tipc_subscrp_get > > tipc_subscrp_send_event > > tipc_close_conn > > tipc_subscrb_release_cb > > tipc_subscrb_delete > > tipc_subscrp_put > > tipc_subscrp_put > > tipc_subscrp_kref_release > > tipc_nametbl_unsubscribe > > spin_lock_bh(&tn->nametbl_lock) > > <<grab nametbl_lock again>> > > > > The root cause of two deadlocks is that we have to hold nametbl lock > > when subscription is freed in tipc_subscrp_kref_release(). In order to > > eliminate the need of taking nametbl lock in > > tipc_subscrp_kref_release(), the functions protected by nametbl lock > > in tipc_subscrp_kref_release() are moved to other places step by step in > the series. > > > > Ying Xue (5): > > tipc: advance the time of deleting subscription from > > subscriber->subscrp_list > > tipc: adjust the policy of holding subscription kref > > tipc: adjust policy that sub->timer holds subscription kref > > tipc: advance the time of calling tipc_nametbl_unsubscribe > > tipc: remove unnecessary increasement of subscription refcount > > > > net/tipc/name_table.c | 2 ++ > > net/tipc/subscr.c | 32 ++++++++++++++------------------ > > net/tipc/subscr.h | 3 +++ > > 3 files changed, 19 insertions(+), 18 deletions(-) > > > > -- > > 2.7.4 > > > ------------------------------------------------------------------------------ > Check out the vibrant tech community on one of the world's most engaging > tech sites, SlashDot.org! http://sdm.link/slashdot > _______________________________________________ > tipc-discussion mailing list > tip...@li...<mailto:tip...@li...> > https://lists.sourceforge.net/lists/listinfo/tipc-discussion |
From: John T. <tho...@gm...> - 2017-02-22 08:51:33
|
I am running a test overnight on the patches as they are and will update later on. JT On Wed, Feb 22, 2017 at 11:56 AM, Jon Maloy <jon...@er...> wrote: > Hi John. > > Yes you are right. But I still would prefer a condensed patch where we > don’t touch the refcounts when this goes into ‘net’. I am awaiting a > comment from Ying. > > > > ///jon > > > > > > *From:* John Thompson [mailto:tho...@gm...] > *Sent:* Tuesday, February 21, 2017 04:18 PM > *To:* Jon Maloy <jon...@er...> > *Cc:* Ying Xue <yin...@wi...>; Parthasarathy Bhuvaragan < > par...@er...>; tipc-discussion@lists. > sourceforge.net > *Subject:* Re: [net 0/5] solve two deadlock issues > > > > Patch #2 removes the tipc_subscrp_get() and _put() > from tipc_subscrp_report_overlap(). > > This prevents the problem of the early returns. > > JT > > > > > > On Wed, Feb 22, 2017 at 1:42 AM, Jon Maloy <jon...@er...> wrote: > > I don't see that you remove the two premature 'return's in > subcsrb_report_overlap() in your series. These are also genuine bugs that > must be fixed. > > ///jon > > > > -----Original Message----- > > From: Jon Maloy [mailto:jon...@er...] > > Sent: Tuesday, February 21, 2017 06:12 AM > > To: Ying Xue <yin...@wi...>; Parthasarathy Bhuvaragan > > <par...@er...>; tho...@gm... > > Cc: tip...@li... > > > Subject: Re: [tipc-discussion] [net 0/5] solve two deadlock issues > > > > Hi Ying, > > These are good design changes, that definitely should go in asap. > However, I > > feel deeply uncomfortable with such a big change going into 'net', > especially > > since our previous, exceptionally large, contribution now has turned out > to > > have problems. I wonder if we could not get away with something simpler > > for 'net'. > > > > Looking closer at your series, it seems to me that only patches ## 1, 4, > and > > the lock removal part of #5 are really needed to solve the problem we > have > > at hand now. Why not merge those into one patch and deliver this to > 'net', > > while reference count redesign parts can go into net-next ? > > > > Regards > > ///jon > > > > > > > -----Original Message----- > > > From: Ying Xue [mailto:yin...@wi...] > > > Sent: Monday, February 20, 2017 06:39 AM > > > To: Jon Maloy <jon...@er...>; Parthasarathy Bhuvaragan > > > <par...@er...>; tho...@gm... > > > Cc: tip...@li... > > > Subject: [net 0/5] solve two deadlock issues > > > > > > Commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > > > delete") accidently introduce the following deadlock scenarios: > > > > > > CPU1: CPU2: > > > ---------- ---------------- > > > tipc_nametbl_publish > > > spin_lock_bh(&tn->nametbl_lock) > > > tipc_nametbl_insert_publ > > > tipc_nameseq_insert_publ > > > tipc_subscrp_report_overlap > > > tipc_subscrp_get > > > tipc_subscrp_send_event > > > tipc_close_conn > > > tipc_subscrb_release_cb > > > tipc_subscrb_delete > > > tipc_subscrp_put > > > tipc_subscrp_put > > > tipc_subscrp_kref_release > > > tipc_nametbl_unsubscribe > > > spin_lock_bh(&tn->nametbl_lock) > > > <<grab nametbl_lock again>> > > > > > > CPU1: CPU2: > > > ---------- ---------------- > > > tipc_nametbl_stop > > > spin_lock_bh(&tn->nametbl_lock) > > > tipc_purge_publications > > > tipc_nameseq_remove_publ > > > tipc_subscrp_report_overlap > > > tipc_subscrp_get > > > tipc_subscrp_send_event > > > tipc_close_conn > > > tipc_subscrb_release_cb > > > tipc_subscrb_delete > > > tipc_subscrp_put > > > tipc_subscrp_put > > > tipc_subscrp_kref_release > > > tipc_nametbl_unsubscribe > > > spin_lock_bh(&tn->nametbl_lock) > > > <<grab nametbl_lock again>> > > > > > > The root cause of two deadlocks is that we have to hold nametbl lock > > > when subscription is freed in tipc_subscrp_kref_release(). In order to > > > eliminate the need of taking nametbl lock in > > > tipc_subscrp_kref_release(), the functions protected by nametbl lock > > > in tipc_subscrp_kref_release() are moved to other places step by step > in > > the series. > > > > > > Ying Xue (5): > > > tipc: advance the time of deleting subscription from > > > subscriber->subscrp_list > > > tipc: adjust the policy of holding subscription kref > > > tipc: adjust policy that sub->timer holds subscription kref > > > tipc: advance the time of calling tipc_nametbl_unsubscribe > > > tipc: remove unnecessary increasement of subscription refcount > > > > > > net/tipc/name_table.c | 2 ++ > > > net/tipc/subscr.c | 32 ++++++++++++++------------------ > > > net/tipc/subscr.h | 3 +++ > > > 3 files changed, 19 insertions(+), 18 deletions(-) > > > > > > -- > > > 2.7.4 > > > > > > > ------------------------------------------------------------ > ------------------ > > Check out the vibrant tech community on one of the world's most engaging > > tech sites, SlashDot.org! http://sdm.link/slashdot > > _______________________________________________ > > tipc-discussion mailing list > > tip...@li... > > https://lists.sourceforge.net/lists/listinfo/tipc-discussion > > > |
From: Ying X. <yin...@wi...> - 2017-02-22 11:43:44
|
On 02/22/2017 04:51 PM, John Thompson wrote: > I am running a test overnight on the patches as they are and will update > later on. Thank you very much, and I am waiting for your test result. Thanks, Ying > JT > |
From: Xue, Y. <Yin...@wi...> - 2017-02-22 11:42:43
|
Hi Jon, I understood your concern. I have checked the possibility of merging patch #1, #4 and #5 as one. However, just merging the three patch is insufficient, and at least #2 seems necessary too, otherwise, another deadlock still exists due to two premature 'return's in subcsrb_report_overlap(). Even if we merged them as one, it will lose my initial purpose of dividing the series as so small patches. Although each patch is made a small change, it's often related to a policy adjustment of locking or holding refcount. Moreover, as our locking policy associated with topserver becomes complex, I want to use the comments in each patch header to record what policy has been adjusted. In the future, the information can guide whether our changes comply with the adjusted policy or not. In fact, all changes contained in the series is not big. But if we merged them as one, all useful messages will be lost forever. Additionally, "net-next" tree reaches 4.10-rc8, and "net" tree is 4.10-rc7 now. I saw today there was one developer who submitted a patch to net-next and David also accepted it. However, if John's testing proved the series is okay tomorrow, probably I can send the series to net-next tomorrow. Even for the worst case, we cannot submit the series until net-next is open again. But I have checked nobody would maintain 4.10 as a stable version. So even if there is a big long time gap, it seems not to cause a series issue. Regards, Ying -----Original Message----- From: Jon Maloy [mailto:jon...@er...] Sent: Tuesday, February 21, 2017 7:12 PM To: Xue, Ying; Parthasarathy Bhuvaragan; tho...@gm... Cc: tip...@li... Subject: RE: [net 0/5] solve two deadlock issues Hi Ying, These are good design changes, that definitely should go in asap. However, I feel deeply uncomfortable with such a big change going into 'net', especially since our previous, exceptionally large, contribution now has turned out to have problems. I wonder if we could not get away with something simpler for 'net'. Looking closer at your series, it seems to me that only patches ## 1, 4, and the lock removal part of #5 are really needed to solve the problem we have at hand now. Why not merge those into one patch and deliver this to 'net', while reference count redesign parts can go into net-next ? Regards ///jon > -----Original Message----- > From: Ying Xue [mailto:yin...@wi...] > Sent: Monday, February 20, 2017 06:39 AM > To: Jon Maloy <jon...@er...>; Parthasarathy Bhuvaragan > <par...@er...>; tho...@gm... > Cc: tip...@li... > Subject: [net 0/5] solve two deadlock issues > > Commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > delete") accidently introduce the following deadlock scenarios: > > CPU1: CPU2: > ---------- ---------------- > tipc_nametbl_publish > spin_lock_bh(&tn->nametbl_lock) > tipc_nametbl_insert_publ > tipc_nameseq_insert_publ > tipc_subscrp_report_overlap > tipc_subscrp_get > tipc_subscrp_send_event > tipc_close_conn > tipc_subscrb_release_cb > tipc_subscrb_delete > tipc_subscrp_put > tipc_subscrp_put > tipc_subscrp_kref_release > tipc_nametbl_unsubscribe > spin_lock_bh(&tn->nametbl_lock) > <<grab nametbl_lock again>> > > CPU1: CPU2: > ---------- ---------------- > tipc_nametbl_stop > spin_lock_bh(&tn->nametbl_lock) > tipc_purge_publications > tipc_nameseq_remove_publ > tipc_subscrp_report_overlap > tipc_subscrp_get > tipc_subscrp_send_event > tipc_close_conn > tipc_subscrb_release_cb > tipc_subscrb_delete > tipc_subscrp_put > tipc_subscrp_put > tipc_subscrp_kref_release > tipc_nametbl_unsubscribe > spin_lock_bh(&tn->nametbl_lock) > <<grab nametbl_lock again>> > > The root cause of two deadlocks is that we have to hold nametbl lock > when subscription is freed in tipc_subscrp_kref_release(). In order to > eliminate the need of taking nametbl lock in > tipc_subscrp_kref_release(), the functions protected by nametbl lock > in tipc_subscrp_kref_release() are moved to other places step by step in the series. > > Ying Xue (5): > tipc: advance the time of deleting subscription from > subscriber->subscrp_list > tipc: adjust the policy of holding subscription kref > tipc: adjust policy that sub->timer holds subscription kref > tipc: advance the time of calling tipc_nametbl_unsubscribe > tipc: remove unnecessary increasement of subscription refcount > > net/tipc/name_table.c | 2 ++ > net/tipc/subscr.c | 32 ++++++++++++++------------------ > net/tipc/subscr.h | 3 +++ > 3 files changed, 19 insertions(+), 18 deletions(-) > > -- > 2.7.4 |
From: Ying X. <yin...@wi...> - 2017-02-22 11:46:58
|
At least, net-next tree is still open as David is reviewing patches submitted to net-next. Hope we have a window to submit the series to net-next. Thanks, Ying On 02/22/2017 07:42 PM, Xue, Ying wrote: > Hi Jon, > > I understood your concern. > > I have checked the possibility of merging patch #1, #4 and #5 as one. However, just merging the three patch is insufficient, and at least #2 seems necessary too, otherwise, another deadlock still exists due to two premature 'return's in subcsrb_report_overlap(). Even if we merged them as one, it will lose my initial purpose of dividing the series as so small patches. Although each patch is made a small change, it's often related to a policy adjustment of locking or holding refcount. Moreover, as our locking policy associated with topserver becomes complex, I want to use the comments in each patch header to record what policy has been adjusted. In the future, the information can guide whether our changes comply with the adjusted policy or not. > > In fact, all changes contained in the series is not big. But if we merged them as one, all useful messages will be lost forever. > > Additionally, "net-next" tree reaches 4.10-rc8, and "net" tree is 4.10-rc7 now. I saw today there was one developer who submitted a patch to net-next and David also accepted it. However, if John's testing proved the series is okay tomorrow, probably I can send the series to net-next tomorrow. Even for the worst case, we cannot submit the series until net-next is open again. But I have checked nobody would maintain 4.10 as a stable version. So even if there is a big long time gap, it seems not to cause a series issue. > > Regards, > Ying > > -----Original Message----- > From: Jon Maloy [mailto:jon...@er...] > Sent: Tuesday, February 21, 2017 7:12 PM > To: Xue, Ying; Parthasarathy Bhuvaragan; tho...@gm... > Cc: tip...@li... > Subject: RE: [net 0/5] solve two deadlock issues > > Hi Ying, > These are good design changes, that definitely should go in asap. However, I feel deeply uncomfortable with such a big change going into 'net', especially since our previous, exceptionally large, contribution now has turned out to have problems. I wonder if we could not get away with something simpler for 'net'. > > Looking closer at your series, it seems to me that only patches ## 1, 4, and the lock removal part of #5 are really needed to solve the problem we have at hand now. Why not merge those into one patch and deliver this to 'net', while reference count redesign parts can go into net-next ? > > Regards > ///jon > > >> -----Original Message----- >> From: Ying Xue [mailto:yin...@wi...] >> Sent: Monday, February 20, 2017 06:39 AM >> To: Jon Maloy <jon...@er...>; Parthasarathy Bhuvaragan >> <par...@er...>; tho...@gm... >> Cc: tip...@li... >> Subject: [net 0/5] solve two deadlock issues >> >> Commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid >> delete") accidently introduce the following deadlock scenarios: >> >> CPU1: CPU2: >> ---------- ---------------- >> tipc_nametbl_publish >> spin_lock_bh(&tn->nametbl_lock) >> tipc_nametbl_insert_publ >> tipc_nameseq_insert_publ >> tipc_subscrp_report_overlap >> tipc_subscrp_get >> tipc_subscrp_send_event >> tipc_close_conn >> tipc_subscrb_release_cb >> tipc_subscrb_delete >> tipc_subscrp_put >> tipc_subscrp_put >> tipc_subscrp_kref_release >> tipc_nametbl_unsubscribe >> spin_lock_bh(&tn->nametbl_lock) >> <<grab nametbl_lock again>> >> >> CPU1: CPU2: >> ---------- ---------------- >> tipc_nametbl_stop >> spin_lock_bh(&tn->nametbl_lock) >> tipc_purge_publications >> tipc_nameseq_remove_publ >> tipc_subscrp_report_overlap >> tipc_subscrp_get >> tipc_subscrp_send_event >> tipc_close_conn >> tipc_subscrb_release_cb >> tipc_subscrb_delete >> tipc_subscrp_put >> tipc_subscrp_put >> tipc_subscrp_kref_release >> tipc_nametbl_unsubscribe >> spin_lock_bh(&tn->nametbl_lock) >> <<grab nametbl_lock again>> >> >> The root cause of two deadlocks is that we have to hold nametbl lock >> when subscription is freed in tipc_subscrp_kref_release(). In order to >> eliminate the need of taking nametbl lock in >> tipc_subscrp_kref_release(), the functions protected by nametbl lock >> in tipc_subscrp_kref_release() are moved to other places step by step in the series. >> >> Ying Xue (5): >> tipc: advance the time of deleting subscription from >> subscriber->subscrp_list >> tipc: adjust the policy of holding subscription kref >> tipc: adjust policy that sub->timer holds subscription kref >> tipc: advance the time of calling tipc_nametbl_unsubscribe >> tipc: remove unnecessary increasement of subscription refcount >> >> net/tipc/name_table.c | 2 ++ >> net/tipc/subscr.c | 32 ++++++++++++++------------------ >> net/tipc/subscr.h | 3 +++ >> 3 files changed, 19 insertions(+), 18 deletions(-) >> >> -- >> 2.7.4 > > > ------------------------------------------------------------------------------ > Check out the vibrant tech community on one of the world's most > engaging tech sites, SlashDot.org! http://sdm.link/slashdot > _______________________________________________ > tipc-discussion mailing list > tip...@li... > https://lists.sourceforge.net/lists/listinfo/tipc-discussion > |
From: Jon M. <jon...@er...> - 2017-02-22 12:29:59
|
Ok. So go for it to net-next. Reviewed-by: Jon ///jon > -----Original Message----- > From: Ying Xue [mailto:yin...@wi...] > Sent: Wednesday, February 22, 2017 06:47 AM > To: Jon Maloy <jon...@er...>; Parthasarathy Bhuvaragan > <par...@er...>; tho...@gm... > Cc: tip...@li... > Subject: Re: [tipc-discussion] [net 0/5] solve two deadlock issues > > At least, net-next tree is still open as David is reviewing patches submitted to > net-next. > > Hope we have a window to submit the series to net-next. > > Thanks, > Ying > > On 02/22/2017 07:42 PM, Xue, Ying wrote: > > Hi Jon, > > > > I understood your concern. > > > > I have checked the possibility of merging patch #1, #4 and #5 as one. > However, just merging the three patch is insufficient, and at least #2 seems > necessary too, otherwise, another deadlock still exists due to two premature > 'return's in subcsrb_report_overlap(). Even if we merged them as one, it will > lose my initial purpose of dividing the series as so small patches. Although > each patch is made a small change, it's often related to a policy adjustment of > locking or holding refcount. Moreover, as our locking policy associated with > topserver becomes complex, I want to use the comments in each patch > header to record what policy has been adjusted. In the future, the > information can guide whether our changes comply with the adjusted policy > or not. > > > > In fact, all changes contained in the series is not big. But if we merged them > as one, all useful messages will be lost forever. > > > > Additionally, "net-next" tree reaches 4.10-rc8, and "net" tree is 4.10-rc7 > now. I saw today there was one developer who submitted a patch to net- > next and David also accepted it. However, if John's testing proved the series > is okay tomorrow, probably I can send the series to net-next tomorrow. Even > for the worst case, we cannot submit the series until net-next is open again. > But I have checked nobody would maintain 4.10 as a stable version. So even > if there is a big long time gap, it seems not to cause a series issue. > > > > Regards, > > Ying > > > > -----Original Message----- > > From: Jon Maloy [mailto:jon...@er...] > > Sent: Tuesday, February 21, 2017 7:12 PM > > To: Xue, Ying; Parthasarathy Bhuvaragan; tho...@gm... > > Cc: tip...@li... > > Subject: RE: [net 0/5] solve two deadlock issues > > > > Hi Ying, > > These are good design changes, that definitely should go in asap. However, > I feel deeply uncomfortable with such a big change going into 'net', especially > since our previous, exceptionally large, contribution now has turned out to > have problems. I wonder if we could not get away with something simpler > for 'net'. > > > > Looking closer at your series, it seems to me that only patches ## 1, 4, and > the lock removal part of #5 are really needed to solve the problem we have > at hand now. Why not merge those into one patch and deliver this to 'net', > while reference count redesign parts can go into net-next ? > > > > Regards > > ///jon > > > > > >> -----Original Message----- > >> From: Ying Xue [mailto:yin...@wi...] > >> Sent: Monday, February 20, 2017 06:39 AM > >> To: Jon Maloy <jon...@er...>; Parthasarathy Bhuvaragan > >> <par...@er...>; tho...@gm... > >> Cc: tip...@li... > >> Subject: [net 0/5] solve two deadlock issues > >> > >> Commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > >> delete") accidently introduce the following deadlock scenarios: > >> > >> CPU1: CPU2: > >> ---------- ---------------- > >> tipc_nametbl_publish > >> spin_lock_bh(&tn->nametbl_lock) > >> tipc_nametbl_insert_publ > >> tipc_nameseq_insert_publ > >> tipc_subscrp_report_overlap > >> tipc_subscrp_get > >> tipc_subscrp_send_event > >> tipc_close_conn > >> tipc_subscrb_release_cb > >> tipc_subscrb_delete > >> tipc_subscrp_put > >> tipc_subscrp_put > >> tipc_subscrp_kref_release > >> tipc_nametbl_unsubscribe > >> spin_lock_bh(&tn->nametbl_lock) > >> <<grab nametbl_lock again>> > >> > >> CPU1: CPU2: > >> ---------- ---------------- > >> tipc_nametbl_stop > >> spin_lock_bh(&tn->nametbl_lock) > >> tipc_purge_publications > >> tipc_nameseq_remove_publ > >> tipc_subscrp_report_overlap > >> tipc_subscrp_get > >> tipc_subscrp_send_event > >> tipc_close_conn > >> tipc_subscrb_release_cb > >> tipc_subscrb_delete > >> tipc_subscrp_put > >> tipc_subscrp_put > >> tipc_subscrp_kref_release > >> tipc_nametbl_unsubscribe > >> spin_lock_bh(&tn->nametbl_lock) > >> <<grab nametbl_lock again>> > >> > >> The root cause of two deadlocks is that we have to hold nametbl lock > >> when subscription is freed in tipc_subscrp_kref_release(). In order > >> to eliminate the need of taking nametbl lock in > >> tipc_subscrp_kref_release(), the functions protected by nametbl lock > >> in tipc_subscrp_kref_release() are moved to other places step by step in > the series. > >> > >> Ying Xue (5): > >> tipc: advance the time of deleting subscription from > >> subscriber->subscrp_list > >> tipc: adjust the policy of holding subscription kref > >> tipc: adjust policy that sub->timer holds subscription kref > >> tipc: advance the time of calling tipc_nametbl_unsubscribe > >> tipc: remove unnecessary increasement of subscription refcount > >> > >> net/tipc/name_table.c | 2 ++ > >> net/tipc/subscr.c | 32 ++++++++++++++------------------ > >> net/tipc/subscr.h | 3 +++ > >> 3 files changed, 19 insertions(+), 18 deletions(-) > >> > >> -- > >> 2.7.4 > > > > > > ---------------------------------------------------------------------- > > -------- Check out the vibrant tech community on one of the world's > > most engaging tech sites, SlashDot.org! http://sdm.link/slashdot > > _______________________________________________ > > tipc-discussion mailing list > > tip...@li... > > https://lists.sourceforge.net/lists/listinfo/tipc-discussion > > |
From: John T. <tho...@gm...> - 2017-02-22 20:09:39
|
My overnight testing has not shown any problems with this patch set. JT On Thu, Feb 23, 2017 at 1:29 AM, Jon Maloy <jon...@er...> wrote: > Ok. So go for it to net-next. > Reviewed-by: Jon > ///jon > > > -----Original Message----- > > From: Ying Xue [mailto:yin...@wi...] > > Sent: Wednesday, February 22, 2017 06:47 AM > > To: Jon Maloy <jon...@er...>; Parthasarathy Bhuvaragan > > <par...@er...>; tho...@gm... > > Cc: tip...@li... > > Subject: Re: [tipc-discussion] [net 0/5] solve two deadlock issues > > > > At least, net-next tree is still open as David is reviewing patches > submitted to > > net-next. > > > > Hope we have a window to submit the series to net-next. > > > > Thanks, > > Ying > > > > On 02/22/2017 07:42 PM, Xue, Ying wrote: > > > Hi Jon, > > > > > > I understood your concern. > > > > > > I have checked the possibility of merging patch #1, #4 and #5 as one. > > However, just merging the three patch is insufficient, and at least #2 > seems > > necessary too, otherwise, another deadlock still exists due to two > premature > > 'return's in subcsrb_report_overlap(). Even if we merged them as one, it > will > > lose my initial purpose of dividing the series as so small patches. > Although > > each patch is made a small change, it's often related to a policy > adjustment of > > locking or holding refcount. Moreover, as our locking policy associated > with > > topserver becomes complex, I want to use the comments in each patch > > header to record what policy has been adjusted. In the future, the > > information can guide whether our changes comply with the adjusted policy > > or not. > > > > > > In fact, all changes contained in the series is not big. But if we > merged them > > as one, all useful messages will be lost forever. > > > > > > Additionally, "net-next" tree reaches 4.10-rc8, and "net" tree is > 4.10-rc7 > > now. I saw today there was one developer who submitted a patch to net- > > next and David also accepted it. However, if John's testing proved the > series > > is okay tomorrow, probably I can send the series to net-next tomorrow. > Even > > for the worst case, we cannot submit the series until net-next is open > again. > > But I have checked nobody would maintain 4.10 as a stable version. So > even > > if there is a big long time gap, it seems not to cause a series issue. > > > > > > Regards, > > > Ying > > > > > > -----Original Message----- > > > From: Jon Maloy [mailto:jon...@er...] > > > Sent: Tuesday, February 21, 2017 7:12 PM > > > To: Xue, Ying; Parthasarathy Bhuvaragan; tho...@gm... > > > Cc: tip...@li... > > > Subject: RE: [net 0/5] solve two deadlock issues > > > > > > Hi Ying, > > > These are good design changes, that definitely should go in asap. > However, > > I feel deeply uncomfortable with such a big change going into 'net', > especially > > since our previous, exceptionally large, contribution now has turned out > to > > have problems. I wonder if we could not get away with something simpler > > for 'net'. > > > > > > Looking closer at your series, it seems to me that only patches ## 1, > 4, and > > the lock removal part of #5 are really needed to solve the problem we > have > > at hand now. Why not merge those into one patch and deliver this to > 'net', > > while reference count redesign parts can go into net-next ? > > > > > > Regards > > > ///jon > > > > > > > > >> -----Original Message----- > > >> From: Ying Xue [mailto:yin...@wi...] > > >> Sent: Monday, February 20, 2017 06:39 AM > > >> To: Jon Maloy <jon...@er...>; Parthasarathy Bhuvaragan > > >> <par...@er...>; tho...@gm... > > >> Cc: tip...@li... > > >> Subject: [net 0/5] solve two deadlock issues > > >> > > >> Commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > > >> delete") accidently introduce the following deadlock scenarios: > > >> > > >> CPU1: CPU2: > > >> ---------- ---------------- > > >> tipc_nametbl_publish > > >> spin_lock_bh(&tn->nametbl_lock) > > >> tipc_nametbl_insert_publ > > >> tipc_nameseq_insert_publ > > >> tipc_subscrp_report_overlap > > >> tipc_subscrp_get > > >> tipc_subscrp_send_event > > >> tipc_close_conn > > >> tipc_subscrb_release_cb > > >> tipc_subscrb_delete > > >> tipc_subscrp_put > > >> tipc_subscrp_put > > >> tipc_subscrp_kref_release > > >> tipc_nametbl_unsubscribe > > >> spin_lock_bh(&tn->nametbl_lock) > > >> <<grab nametbl_lock again>> > > >> > > >> CPU1: CPU2: > > >> ---------- ---------------- > > >> tipc_nametbl_stop > > >> spin_lock_bh(&tn->nametbl_lock) > > >> tipc_purge_publications > > >> tipc_nameseq_remove_publ > > >> tipc_subscrp_report_overlap > > >> tipc_subscrp_get > > >> tipc_subscrp_send_event > > >> tipc_close_conn > > >> tipc_subscrb_release_cb > > >> tipc_subscrb_delete > > >> tipc_subscrp_put > > >> tipc_subscrp_put > > >> tipc_subscrp_kref_release > > >> tipc_nametbl_unsubscribe > > >> spin_lock_bh(&tn->nametbl_lock) > > >> <<grab nametbl_lock again>> > > >> > > >> The root cause of two deadlocks is that we have to hold nametbl lock > > >> when subscription is freed in tipc_subscrp_kref_release(). In order > > >> to eliminate the need of taking nametbl lock in > > >> tipc_subscrp_kref_release(), the functions protected by nametbl lock > > >> in tipc_subscrp_kref_release() are moved to other places step by step > in > > the series. > > >> > > >> Ying Xue (5): > > >> tipc: advance the time of deleting subscription from > > >> subscriber->subscrp_list > > >> tipc: adjust the policy of holding subscription kref > > >> tipc: adjust policy that sub->timer holds subscription kref > > >> tipc: advance the time of calling tipc_nametbl_unsubscribe > > >> tipc: remove unnecessary increasement of subscription refcount > > >> > > >> net/tipc/name_table.c | 2 ++ > > >> net/tipc/subscr.c | 32 ++++++++++++++------------------ > > >> net/tipc/subscr.h | 3 +++ > > >> 3 files changed, 19 insertions(+), 18 deletions(-) > > >> > > >> -- > > >> 2.7.4 > > > > > > > > > ---------------------------------------------------------------------- > > > -------- Check out the vibrant tech community on one of the world's > > > most engaging tech sites, SlashDot.org! http://sdm.link/slashdot > > > _______________________________________________ > > > tipc-discussion mailing list > > > tip...@li... > > > https://lists.sourceforge.net/lists/listinfo/tipc-discussion > > > > |
From: Ying X. <yin...@wi...> - 2017-02-27 10:03:56
|
On 02/23/2017 04:09 AM, John Thompson wrote: > My overnight testing has not shown any problems with this patch set. Thanks for the testing! Regards, Ying > JT > > On Thu, Feb 23, 2017 at 1:29 AM, Jon Maloy <jon...@er... > <mailto:jon...@er...>> wrote: > > Ok. So go for it to net-next. > Reviewed-by: Jon > ///jon > > > -----Original Message----- > > From: Ying Xue [mailto:yin...@wi... <mailto:yin...@wi...>] > > Sent: Wednesday, February 22, 2017 06:47 AM > > To: Jon Maloy <jon...@er... <mailto:jon...@er...>>; > Parthasarathy Bhuvaragan > > <par...@er... > <mailto:par...@er...>>; > tho...@gm... <mailto:tho...@gm...> > > Cc: tip...@li... > <mailto:tip...@li...> > > Subject: Re: [tipc-discussion] [net 0/5] solve two deadlock issues > > > > At least, net-next tree is still open as David is reviewing > patches submitted to > > net-next. > > > > Hope we have a window to submit the series to net-next. > > > > Thanks, > > Ying > > > > On 02/22/2017 07:42 PM, Xue, Ying wrote: > > > Hi Jon, > > > > > > I understood your concern. > > > > > > I have checked the possibility of merging patch #1, #4 and #5 as > one. > > However, just merging the three patch is insufficient, and at > least #2 seems > > necessary too, otherwise, another deadlock still exists due to two > premature > > 'return's in subcsrb_report_overlap(). Even if we merged them as > one, it will > > lose my initial purpose of dividing the series as so small > patches. Although > > each patch is made a small change, it's often related to a policy > adjustment of > > locking or holding refcount. Moreover, as our locking policy > associated with > > topserver becomes complex, I want to use the comments in each patch > > header to record what policy has been adjusted. In the future, the > > information can guide whether our changes comply with the adjusted > policy > > or not. > > > > > > In fact, all changes contained in the series is not big. But if > we merged them > > as one, all useful messages will be lost forever. > > > > > > Additionally, "net-next" tree reaches 4.10-rc8, and "net" tree > is 4.10-rc7 > > now. I saw today there was one developer who submitted a patch to net- > > next and David also accepted it. However, if John's testing proved > the series > > is okay tomorrow, probably I can send the series to net-next > tomorrow. Even > > for the worst case, we cannot submit the series until net-next is > open again. > > But I have checked nobody would maintain 4.10 as a stable version. > So even > > if there is a big long time gap, it seems not to cause a series issue. > > > > > > Regards, > > > Ying > > > > > > -----Original Message----- > > > From: Jon Maloy [mailto:jon...@er... > <mailto:jon...@er...>] > > > Sent: Tuesday, February 21, 2017 7:12 PM > > > To: Xue, Ying; Parthasarathy Bhuvaragan; tho...@gm... > <mailto:tho...@gm...> > > > Cc: tip...@li... > <mailto:tip...@li...> > > > Subject: RE: [net 0/5] solve two deadlock issues > > > > > > Hi Ying, > > > These are good design changes, that definitely should go in > asap. However, > > I feel deeply uncomfortable with such a big change going into > 'net', especially > > since our previous, exceptionally large, contribution now has > turned out to > > have problems. I wonder if we could not get away with something > simpler > > for 'net'. > > > > > > Looking closer at your series, it seems to me that only patches > ## 1, 4, and > > the lock removal part of #5 are really needed to solve the problem > we have > > at hand now. Why not merge those into one patch and deliver this > to 'net', > > while reference count redesign parts can go into net-next ? > > > > > > Regards > > > ///jon > > > > > > > > >> -----Original Message----- > > >> From: Ying Xue [mailto:yin...@wi... > <mailto:yin...@wi...>] > > >> Sent: Monday, February 20, 2017 06:39 AM > > >> To: Jon Maloy <jon...@er... > <mailto:jon...@er...>>; Parthasarathy Bhuvaragan > > >> <par...@er... > <mailto:par...@er...>>; > tho...@gm... <mailto:tho...@gm...> > > >> Cc: tip...@li... > <mailto:tip...@li...> > > >> Subject: [net 0/5] solve two deadlock issues > > >> > > >> Commit d094c4d5f5 ("tipc: add subscription refcount to avoid > invalid > > >> delete") accidently introduce the following deadlock scenarios: > > >> > > >> CPU1: CPU2: > > >> ---------- ---------------- > > >> tipc_nametbl_publish > > >> spin_lock_bh(&tn->nametbl_lock) > > >> tipc_nametbl_insert_publ > > >> tipc_nameseq_insert_publ > > >> tipc_subscrp_report_overlap > > >> tipc_subscrp_get > > >> tipc_subscrp_send_event > > >> tipc_close_conn > > >> tipc_subscrb_release_cb > > >> tipc_subscrb_delete > > >> tipc_subscrp_put > > >> tipc_subscrp_put > > >> tipc_subscrp_kref_release > > >> tipc_nametbl_unsubscribe > > >> spin_lock_bh(&tn->nametbl_lock) > > >> <<grab nametbl_lock again>> > > >> > > >> CPU1: CPU2: > > >> ---------- ---------------- > > >> tipc_nametbl_stop > > >> spin_lock_bh(&tn->nametbl_lock) > > >> tipc_purge_publications > > >> tipc_nameseq_remove_publ > > >> tipc_subscrp_report_overlap > > >> tipc_subscrp_get > > >> tipc_subscrp_send_event > > >> tipc_close_conn > > >> tipc_subscrb_release_cb > > >> tipc_subscrb_delete > > >> tipc_subscrp_put > > >> tipc_subscrp_put > > >> tipc_subscrp_kref_release > > >> tipc_nametbl_unsubscribe > > >> spin_lock_bh(&tn->nametbl_lock) > > >> <<grab nametbl_lock again>> > > >> > > >> The root cause of two deadlocks is that we have to hold nametbl > lock > > >> when subscription is freed in tipc_subscrp_kref_release(). In order > > >> to eliminate the need of taking nametbl lock in > > >> tipc_subscrp_kref_release(), the functions protected by nametbl > lock > > >> in tipc_subscrp_kref_release() are moved to other places step > by step in > > the series. > > >> > > >> Ying Xue (5): > > >> tipc: advance the time of deleting subscription from > > >> subscriber->subscrp_list > > >> tipc: adjust the policy of holding subscription kref > > >> tipc: adjust policy that sub->timer holds subscription kref > > >> tipc: advance the time of calling tipc_nametbl_unsubscribe > > >> tipc: remove unnecessary increasement of subscription refcount > > >> > > >> net/tipc/name_table.c | 2 ++ > > >> net/tipc/subscr.c | 32 ++++++++++++++------------------ > > >> net/tipc/subscr.h | 3 +++ > > >> 3 files changed, 19 insertions(+), 18 deletions(-) > > >> > > >> -- > > >> 2.7.4 > > > > > > > > > > ---------------------------------------------------------------------- > > > -------- Check out the vibrant tech community on one of the world's > > > most engaging tech sites, SlashDot.org! http://sdm.link/slashdot > > > _______________________________________________ > > > tipc-discussion mailing list > > > tip...@li... > <mailto:tip...@li...> > > > https://lists.sourceforge.net/lists/listinfo/tipc-discussion > <https://lists.sourceforge.net/lists/listinfo/tipc-discussion> > > > > > |
From: Parthasarathy B. <par...@er...> - 2017-02-23 12:10:58
|
Hi Ying, Thanks for the patches. I introduced these bugs and was sick for more than 2 weeks, so you had to fix them. I have few comments for the entire series. [tipc-discussion] [net 2/5] tipc: adjust the policy of holding subscription kref<https://sourceforge.net/p/tipc/mailman/message/35676908/><https://sourceforge.net/p/tipc/mailman/message/35676908/> The above commit looks good and should fix the issue reported by John. [tipc-discussion] [net 1/5] tipc: advance the time of deleting subscription from subscriber->subscrp_list<https://sourceforge.net/p/tipc/mailman/message/35676906/> In the above commit, we miss to delete sub->subscrp_list at subscription timeout. We placed this common code in refcount cleanup to avoid code duplication. [tipc-discussion] [net 3/5] tipc: adjust policy that sub->timer holds subscription kref<https://sourceforge.net/p/tipc/mailman/message/35676904/><https://sourceforge.net/p/tipc/mailman/message/35676904/> With this patch, we will never trigger a subscription cleanup at tipc_subscrp_timeout() as the refcount will be one when we exit tipc_subscrp_timeout(). Thus the subscription is still present in nametable and hence the subscriber will receive new events for this subscription after receiving the subscription timeout until the subscriber terminates. We need to update our design policy that at subscription timeout, we remove all references to the subscription and it is no longer accessible. This conflicts with the change proposed by the above commit. If we still need to include this commit, we need to adapt tipc_subscrp_timeout(), as: 1. tipc_nametbl_unsubscribe() 2. tipc_subscrp_put() 3. tipc_subscrp_put() static void tipc_subscrp_delete(struct tipc_subscription *sub) { u32 timeout = htohl(sub->evt.s.timeout, sub->swap); - if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) + if (timeout != TIPC_WAIT_FOREVER && del_timer(&sub->timer)) { tipc_subscrp_put(sub); + tipc_subscrp_put(sub); } else { + tipc_subscrp_put(sub); } } But this makes the functions look odd as we adjust refcount twice in the same function. To get the subscription timeout cleanup working correctly, we need to sqash this commit together :[tipc-discussion] [net 4/5] tipc: advance the time of calling tipc_nametbl_unsubscribe<https://sourceforge.net/p/tipc/mailman/message/35676905/> [tipc-discussion] [net 5/5] tipc: remove unnecessary increasement of subscription refcount<https://sourceforge.net/p/tipc/mailman/message/35676917/> In this commit message, you need to edit the stacktraces as the scenario you describe will not happen due to your previous patches. As we never perform a tipc_subscrp_get() in tipc_subscrp_report_overlap(). /Partha ________________________________ From: John Thompson <tho...@gm...> Sent: Tuesday, February 21, 2017 10:15 PM To: Ying Xue Cc: Jon Maloy; Parthasarathy Bhuvaragan; tip...@li... Subject: Re: [net 5/5] tipc: remove unnecessary increasement of subscription refcount Sorry, I was mistaken. You are right that you have removed the sub->lock from tipc_subscrp_kref_release() and so there is no chance of a deadlock. JT On Tue, Feb 21, 2017 at 11:22 PM, Ying Xue <yin...@wi...<mailto:yin...@wi...>> wrote: On 02/21/2017 06:13 AM, John Thompson wrote: > > > On Tue, Feb 21, 2017 at 12:39 AM, Ying Xue <yin...@wi...<mailto:yin...@wi...> > <mailto:yin...@wi...<mailto:yin...@wi...>>> wrote: > > As the policy of holding subscription in subscriber time has been > adjusted, it's unnecessary to hold subscription refcount before > tipc_subscrp_delete() is called. As a consequence, subscriber->lock > can be safely removed to avoid the following two deadlock scenarios: > > CPU1: CPU2: > ---------- ---------------- > tipc_nametbl_publish > spin_lock_bh(&tn->nametbl_lock) > tipc_nametbl_insert_publ > tipc_nameseq_insert_publ > tipc_subscrp_report_overlap > tipc_subscrp_get > tipc_subscrp_send_event > tipc_close_conn > tipc_subscrb_release_cb > tipc_subscrb_delete > tipc_subscrp_put > tipc_subscrp_put > tipc_subscrp_kref_release > tipc_nametbl_unsubscribe > spin_lock_bh(&tn->nametbl_lock) > <<grab nametbl_lock again>> > > CPU1: CPU2: > ---------- ---------------- > tipc_nametbl_stop > spin_lock_bh(&tn->nametbl_lock) > tipc_purge_publications > tipc_nameseq_remove_publ > tipc_subscrp_report_overlap > tipc_subscrp_get > tipc_subscrp_send_event > tipc_close_conn > tipc_subscrb_release_cb > tipc_subscrb_delete > tipc_subscrp_put > tipc_subscrp_put > tipc_subscrp_kref_release > tipc_nametbl_unsubscribe > spin_lock_bh(&tn->nametbl_lock) > <<grab nametbl_lock again>> > > Reported-by: John Thompson <tho...@gm...<mailto:tho...@gm...> > <mailto:tho...@gm...<mailto:tho...@gm...>>> > Reported-by: Jon Maloy <jon...@er...<mailto:jon...@er...> > <mailto:jon...@er...<mailto:jon...@er...>>> > Fixes: d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > delete") > Signed-off-by: Ying Xue <yin...@wi...<mailto:yin...@wi...> > <mailto:yin...@wi...<mailto:yin...@wi...>>> > --- > net/tipc/subscr.c | 6 ------ > 1 file changed, 6 deletions(-) > > diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c > index 6624232..ea67cce 100644 > --- a/net/tipc/subscr.c > +++ b/net/tipc/subscr.c > @@ -168,9 +168,7 @@ static void tipc_subscrp_kref_release(struct > kref *kref) > struct tipc_net *tn = net_generic(sub->net, tipc_net_id); > struct tipc_subscriber *subscriber = sub->subscriber; > > - spin_lock_bh(&subscriber->lock); > atomic_dec(&tn->subscription_count); > - spin_unlock_bh(&subscriber->lock); > kfree(sub); > tipc_subscrb_put(subscriber); > } > @@ -202,11 +200,7 @@ static void tipc_subscrb_subscrp_delete(struct > tipc_subscriber *subscriber, > list_del(&sub->subscrp_list); > > tipc_nametbl_unsubscribe(sub); > - tipc_subscrp_get(sub); > - spin_unlock_bh(&subscriber->lock); > > > By removing the unlocking at this point couldn't you end up with a > deadlock on subscriber->lock > due to tipc_subscrp_delete() putting the subscrp twice (which could end > up with kref == 0) and > as a result calling tipc_subscrp_kref_release() which gets subscriber->lock? > I think there is no deadlock risk even if tipc_subscrp_put() will be called twice in tipc_subscrp_delete() because tipc_subscrp_kref_release() doesn't hold subscriber->lock after the change. Please take a look at tipc_subscrp_kref_release() code: static void tipc_subscrp_kref_release(struct kref *kref) { struct tipc_subscription *sub = container_of(kref, struct tipc_subscription, kref); struct tipc_net *tn = net_generic(sub->net, tipc_net_id); struct tipc_subscriber *subscriber = sub->subscriber; atomic_dec(&tn->subscription_count); kfree(sub); tipc_subscrb_put(subscriber); } > > > tipc_subscrp_delete(sub); > - tipc_subscrp_put(sub); > - spin_lock_bh(&subscriber->lock); > > if (s) > break; > -- > 2.7.4 > > |
From: Ying X. <yin...@wi...> - 2017-02-27 11:46:39
|
Hi Partha, it's very luck that you have a chance to review the series before they are submitted to mainline. Please take a look at my comments below. On 02/23/2017 08:10 PM, Parthasarathy Bhuvaragan wrote: > Hi Ying, > > > Thanks for the patches. I introduced these bugs and was sick for more > than 2 weeks, so you had to fix them. > > > I have few comments for the entire series. > > [tipc-discussion] [net 2/5] tipc: adjust the policy of holding > subscription kref > <https://sourceforge.net/p/tipc/mailman/message/35676908/>*<https://sourceforge.net/p/tipc/mailman/message/35676908/>* > > The above commit looks good and should fix the issue reported by John. > > > [tipc-discussion] [net 1/5] tipc: advance the time of deleting > subscription from subscriber->subscrp_list > <https://sourceforge.net/p/tipc/mailman/message/35676906/> > > In the above commit, we miss to delete sub->subscrp_list at subscription > timeout. We placed this common code in refcount cleanup to avoid code > duplication. > > > [tipc-discussion] [net 3/5] tipc: adjust policy that sub->timer holds > subscription kref > <https://sourceforge.net/p/tipc/mailman/message/35676904/>*<https://sourceforge.net/p/tipc/mailman/message/35676904/>* > > With this patch, we will never trigger a subscription cleanup at > tipc_subscrp_timeout() as the refcount will be one when we > exittipc_subscrp_timeout(). When I made the changes, in fact I was aware of the case, but I supposed that user should explicitly cancel subscription with TIPC_SUB_CANCEL to destroy subscription instance. However, I was unaware that new subscription events still happen e even if they are expired. I understood your proposal. But as you said, the approach looks very odd for us. However, I have another idea to a bit elegantly fix the issue while we don't need to make a big change for the whole series. Please wait for v2. Thanks, Ying > > Thus the subscription is still present in nametable and hence the > subscriber will receive new events for this subscription after receiving > the subscription timeout until the subscriber terminates. > > We need to update our design policy that at subscription timeout, we > remove all references to the subscription and it is no longer > accessible. This conflicts with the change proposed by the above commit. > > > If we still need to include this commit, we need to adapt > tipc_subscrp_timeout(), as: > > 1. tipc_nametbl_unsubscribe() > 2. tipc_subscrp_put() > 3. tipc_subscrp_put() > > static void tipc_subscrp_delete(struct tipc_subscription *sub) > { > u32 timeout = htohl(sub->evt.s.timeout, sub->swap); > > - if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) > + if (timeout != TIPC_WAIT_FOREVER && del_timer(&sub->timer)) { > tipc_subscrp_put(sub); > + tipc_subscrp_put(sub); > } else { > + tipc_subscrp_put(sub); > } > } > > But this makes the functions look odd as we adjust refcount twice in the > same function. To get the subscription timeout cleanup working > correctly, we need to sqash this commit together :*[tipc-discussion] > [net 4/5] tipc: advance the time of calling tipc_nametbl_unsubscribe > <https://sourceforge.net/p/tipc/mailman/message/35676905/>* > > *[tipc-discussion] [net 5/5] tipc: remove unnecessary increasement of > subscription refcount > <https://sourceforge.net/p/tipc/mailman/message/35676917/>* > > In this commit message, you need to edit the stacktraces as the scenario > you describe will not happen due to your previous patches. As we never > perform a tipc_subscrp_get() in tipc_subscrp_report_overlap(). > > > /Partha > ------------------------------------------------------------------------ > *From:* John Thompson <tho...@gm...> > *Sent:* Tuesday, February 21, 2017 10:15 PM > *To:* Ying Xue > *Cc:* Jon Maloy; Parthasarathy Bhuvaragan; > tip...@li... > *Subject:* Re: [net 5/5] tipc: remove unnecessary increasement of > subscription refcount > > Sorry, I was mistaken. You are right that you have removed the > sub->lock from > tipc_subscrp_kref_release() and so there is no chance of a deadlock. > > JT > > On Tue, Feb 21, 2017 at 11:22 PM, Ying Xue <yin...@wi... > <mailto:yin...@wi...>> wrote: > > On 02/21/2017 06:13 AM, John Thompson wrote: > > > > > > On Tue, Feb 21, 2017 at 12:39 AM, Ying Xue <yin...@wi... <mailto:yin...@wi...> > > <mailto:yin...@wi... <mailto:yin...@wi...>>> > wrote: > > > > As the policy of holding subscription in subscriber time has been > > adjusted, it's unnecessary to hold subscription refcount before > > tipc_subscrp_delete() is called. As a consequence, > subscriber->lock > > can be safely removed to avoid the following two deadlock > scenarios: > > > > CPU1: CPU2: > > ---------- ---------------- > > tipc_nametbl_publish > > spin_lock_bh(&tn->nametbl_lock) > > tipc_nametbl_insert_publ > > tipc_nameseq_insert_publ > > tipc_subscrp_report_overlap > > tipc_subscrp_get > > tipc_subscrp_send_event > > tipc_close_conn > > tipc_subscrb_release_cb > > tipc_subscrb_delete > > tipc_subscrp_put > > tipc_subscrp_put > > tipc_subscrp_kref_release > > tipc_nametbl_unsubscribe > > spin_lock_bh(&tn->nametbl_lock) > > <<grab nametbl_lock again>> > > > > CPU1: CPU2: > > ---------- ---------------- > > tipc_nametbl_stop > > spin_lock_bh(&tn->nametbl_lock) > > tipc_purge_publications > > tipc_nameseq_remove_publ > > tipc_subscrp_report_overlap > > tipc_subscrp_get > > tipc_subscrp_send_event > > tipc_close_conn > > tipc_subscrb_release_cb > > tipc_subscrb_delete > > tipc_subscrp_put > > tipc_subscrp_put > > tipc_subscrp_kref_release > > tipc_nametbl_unsubscribe > > spin_lock_bh(&tn->nametbl_lock) > > <<grab nametbl_lock again>> > > > > Reported-by: John Thompson <tho...@gm... > <mailto:tho...@gm...> > > <mailto:tho...@gm... <mailto:tho...@gm...>>> > > Reported-by: Jon Maloy <jon...@er... > <mailto:jon...@er...> > > <mailto:jon...@er... <mailto:jon...@er...>>> > > Fixes: d094c4d5f5 ("tipc: add subscription refcount to avoid invalid > > delete") > > Signed-off-by: Ying Xue <yin...@wi... <mailto:yin...@wi...> > > <mailto:yin...@wi... <mailto:yin...@wi...>>> > > --- > > net/tipc/subscr.c | 6 ------ > > 1 file changed, 6 deletions(-) > > > > diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c > > index 6624232..ea67cce 100644 > > --- a/net/tipc/subscr.c > > +++ b/net/tipc/subscr.c > > @@ -168,9 +168,7 @@ static void tipc_subscrp_kref_release(struct > > kref *kref) > > struct tipc_net *tn = net_generic(sub->net, tipc_net_id); > > struct tipc_subscriber *subscriber = sub->subscriber; > > > > - spin_lock_bh(&subscriber->lock); > > atomic_dec(&tn->subscription_count); > > - spin_unlock_bh(&subscriber->lock); > > kfree(sub); > > tipc_subscrb_put(subscriber); > > } > > @@ -202,11 +200,7 @@ static void > tipc_subscrb_subscrp_delete(struct > > tipc_subscriber *subscriber, > > list_del(&sub->subscrp_list); > > > > tipc_nametbl_unsubscribe(sub); > > - tipc_subscrp_get(sub); > > - spin_unlock_bh(&subscriber->lock); > > > > > > By removing the unlocking at this point couldn't you end up with a > > deadlock on subscriber->lock > > due to tipc_subscrp_delete() putting the subscrp twice (which > could end > > up with kref == 0) and > > as a result calling tipc_subscrp_kref_release() which gets > subscriber->lock? > > > > I think there is no deadlock risk even if tipc_subscrp_put() will be > called twice in tipc_subscrp_delete() because > tipc_subscrp_kref_release() doesn't hold subscriber->lock after the > change. > > Please take a look at tipc_subscrp_kref_release() code: > static void tipc_subscrp_kref_release(struct kref *kref) > { > struct tipc_subscription *sub = container_of(kref, > struct > tipc_subscription, > kref); > struct tipc_net *tn = net_generic(sub->net, tipc_net_id); > struct tipc_subscriber *subscriber = sub->subscriber; > > atomic_dec(&tn->subscription_count); > kfree(sub); > tipc_subscrb_put(subscriber); > } > > > > > > > tipc_subscrp_delete(sub); > > - tipc_subscrp_put(sub); > > - spin_lock_bh(&subscriber->lock); > > > > if (s) > > break; > > -- > > 2.7.4 > > > > > > |
From: Ying X. <yin...@wi...> - 2017-03-09 14:22:14
|
commit d094c4d5f5 ("tipc: add subscription refcount to avoid invalid delete") accidently introduce the following deadlock scenarios: CPU1: CPU2: ---------- ---------------- tipc_nametbl_publish spin_lock_bh(&tn->nametbl_lock) tipc_nametbl_insert_publ tipc_nameseq_insert_publ tipc_subscrp_report_overlap tipc_subscrp_get tipc_subscrp_send_event tipc_close_conn tipc_subscrb_release_cb tipc_subscrb_delete tipc_subscrp_put tipc_subscrp_put tipc_subscrp_kref_release tipc_nametbl_unsubscribe spin_lock_bh(&tn->nametbl_lock) <<grab nametbl_lock again>> CPU1: CPU2: ---------- ---------------- tipc_nametbl_stop spin_lock_bh(&tn->nametbl_lock) tipc_purge_publications tipc_nameseq_remove_publ tipc_subscrp_report_overlap tipc_subscrp_get tipc_subscrp_send_event tipc_close_conn tipc_subscrb_release_cb tipc_subscrb_delete tipc_subscrp_put tipc_subscrp_put tipc_subscrp_kref_release tipc_nametbl_unsubscribe spin_lock_bh(&tn->nametbl_lock) <<grab nametbl_lock again>> The root cause of two deadlocks is that we have to hold nametbl lock when subscription is freed in tipc_subscrp_kref_release(). In order to eliminate the need of taking nametbl lock in tipc_subscrp_kref_release(), the functions protected by nametbl lock in tipc_subscrp_kref_release() are moved to other places step by step in the series. Change log: v2: As Parth's comments, subscription is still present name table after it's expired. To fix it, we introduce a workqueue, and when subscription's timer is expired, the subscription will be pushed to the workqueue through its work. When the work is scheduled, the subscription be deleted finally. Ying Xue (6): tipc: advance the time of deleting subscription from subscriber->subscrp_list tipc: adjust the policy of holding subscription kref tipc: adjust policy that sub->timer holds subscription kref tipc: advance the time of calling tipc_nametbl_unsubscribe tipc: remove unnecessary increasement of subscription refcount tipc: delete expired subscription net/tipc/name_table.c | 2 ++ net/tipc/server.h | 2 ++ net/tipc/subscr.c | 64 ++++++++++++++++++++++++++++++++++++--------------- net/tipc/subscr.h | 4 ++++ 4 files changed, 54 insertions(+), 18 deletions(-) -- 2.7.4 |
From: Ying X. <yin...@wi...> - 2017-03-09 14:22:20
|
To make it more conventional to hold subscription refcount in timer, its policy is adjusted as follows: Before sub->timer is started, subscription refcount is held; when sub->timer is expired, subscription refcount will be decreased at the end of the timer timeout function; when the timer is stopped, refcount also needs to be decreased if the timer is still active. Signed-off-by: Ying Xue <yin...@wi...> Reviewed-by: Jon Maloy <jon...@er...> --- net/tipc/subscr.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 111d33c6..ffd7b9d 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -241,8 +241,10 @@ static void tipc_subscrp_delete(struct tipc_subscription *sub) { u32 timeout = htohl(sub->evt.s.timeout, sub->swap); - if (timeout == TIPC_WAIT_FOREVER || del_timer(&sub->timer)) + if (timeout != TIPC_WAIT_FOREVER && del_timer(&sub->timer)) tipc_subscrp_put(sub); + + tipc_subscrp_put(sub); } static void tipc_subscrp_cancel(struct tipc_subscr *s, @@ -303,16 +305,18 @@ static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s, spin_lock_bh(&subscriber->lock); list_add(&sub->subscrp_list, &subscriber->subscrp_list); + tipc_subscrb_get(subscriber); sub->subscriber = subscriber; + tipc_nametbl_subscribe(sub); - tipc_subscrb_get(subscriber); spin_unlock_bh(&subscriber->lock); setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub); timeout = htohl(sub->evt.s.timeout, swap); - if (timeout != TIPC_WAIT_FOREVER) - mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout)); + if ((timeout != TIPC_WAIT_FOREVER) && + !mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout))) + tipc_subscrp_get(sub); } /* Handle one termination request for the subscriber */ -- 2.7.4 |