accel-ppp-users Mailing List for accel-ppp (Page 3)
Status: Beta
Brought to you by:
xebd
You can subscribe to this list here.
2012 |
Jan
|
Feb
(2) |
Mar
(4) |
Apr
|
May
(3) |
Jun
(4) |
Jul
(9) |
Aug
(38) |
Sep
(24) |
Oct
(25) |
Nov
(2) |
Dec
(5) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2013 |
Jan
(20) |
Feb
(64) |
Mar
(58) |
Apr
(17) |
May
|
Jun
(29) |
Jul
(16) |
Aug
|
Sep
(1) |
Oct
(8) |
Nov
(1) |
Dec
|
2014 |
Jan
(25) |
Feb
(11) |
Mar
(15) |
Apr
(33) |
May
(2) |
Jun
(23) |
Jul
(3) |
Aug
(11) |
Sep
(12) |
Oct
(37) |
Nov
(8) |
Dec
|
2015 |
Jan
(10) |
Feb
(4) |
Mar
(2) |
Apr
(4) |
May
|
Jun
(3) |
Jul
(3) |
Aug
|
Sep
|
Oct
|
Nov
|
Dec
(12) |
2016 |
Jan
|
Feb
(1) |
Mar
(3) |
Apr
(18) |
May
(21) |
Jun
(28) |
Jul
(11) |
Aug
|
Sep
|
Oct
(1) |
Nov
(21) |
Dec
|
2017 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
(1) |
Sep
|
Oct
|
Nov
|
Dec
|
2018 |
Jan
(4) |
Feb
(6) |
Mar
(7) |
Apr
|
May
|
Jun
(13) |
Jul
(1) |
Aug
|
Sep
(3) |
Oct
(50) |
Nov
(21) |
Dec
(16) |
From: Guillaume N. <g....@al...> - 2018-10-24 14:36:10
|
We need to include <sys/socket.h> to define 'socklen_t', <sys/types.h> for 'ssize_t' and "list.h" for 'struct list_head'. Also, let's include "libnetlink.h" so that we don't need a forward declaration for 'struct rtnl_handle'. Signed-off-by: Guillaume Nault <g....@al...> --- accel-pppd/include/ap_net.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/accel-pppd/include/ap_net.h b/accel-pppd/include/ap_net.h index 25121d94..91ebdd5e 100644 --- a/accel-pppd/include/ap_net.h +++ b/accel-pppd/include/ap_net.h @@ -1,7 +1,11 @@ #ifndef __AP_NET_H #define __AP_NET_H -struct rtnl_handle; +#include <sys/socket.h> +#include <sys/types.h> + +#include "libnetlink.h" +#include "list.h" struct ap_net { struct list_head entry; -- 2.19.1 |
From: Guillaume N. <g....@al...> - 2018-10-24 14:36:10
|
We need to include <stdint.h> to define 'uint*_t' and <string.h> for 'memcpy'. Signed-off-by: Guillaume Nault <g....@al...> --- accel-pppd/include/connlimit.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/accel-pppd/include/connlimit.h b/accel-pppd/include/connlimit.h index 8711cf8b..2b2f0f4d 100644 --- a/accel-pppd/include/connlimit.h +++ b/accel-pppd/include/connlimit.h @@ -1,6 +1,9 @@ #ifndef __CONNLIMIT_H #define __CONNLIMIT_H +#include <stdint.h> +#include <string.h> + static inline uint64_t cl_key_from_ipv4(uint32_t ip) { return ip; -- 2.19.1 |
From: Guillaume N. <g....@al...> - 2018-10-24 14:36:09
|
We need to include <sys/types.h> to define 'size_t'. Also, let's include "ap_session.h" so that we don't need a forward declaration for 'struct ap_session'. Signed-off-by: Guillaume Nault <g....@al...> --- accel-pppd/backup/backup.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/accel-pppd/backup/backup.h b/accel-pppd/backup/backup.h index d36f70bd..106303e2 100644 --- a/accel-pppd/backup/backup.h +++ b/accel-pppd/backup/backup.h @@ -2,6 +2,9 @@ #define __BACKUP_H #include <stdint.h> +#include <sys/types.h> + +#include "ap_session.h" #include "list.h" #define MODID_COMMON 1 @@ -12,8 +15,6 @@ #define MODID_L2TP 6 #define MODID_IPPOOL 7 - -struct ap_session; struct backup_storage; struct backup_data; -- 2.19.1 |
From: Guillaume N. <g....@al...> - 2018-10-24 14:36:03
|
We need to include <netinet/in.h> to define 'struct in6_addr' and <stdint.h> for 'uint8_t'. Signed-off-by: Guillaume Nault <g....@al...> --- accel-pppd/include/ap_session_backup.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/accel-pppd/include/ap_session_backup.h b/accel-pppd/include/ap_session_backup.h index e6382ff5..9e175c85 100644 --- a/accel-pppd/include/ap_session_backup.h +++ b/accel-pppd/include/ap_session_backup.h @@ -1,6 +1,9 @@ #ifndef __AP_SESSION_BACKUP_H #define __AP_SESSION_BACKUP_H +#include <netinet/in.h> +#include <stdint.h> + #define SES_TAG_USERNAME 1 #define SES_TAG_SESSIONID 2 #define SES_TAG_START_TIME 3 -- 2.19.1 |
From: Guillaume N. <g....@al...> - 2018-10-24 14:36:01
|
This patch series fixes header files found under include/ so that they include their own dependencies. This way, one can include any file from include/ without having to manually figure out the dependencies. Each of the include/ header files has been tested for correct compilation when included in an otherwise empty .c file. Guillaume Nault (16): backup: make ap_session_backup.h self-contained backup: make backup.h self-contained connlimit: make connlimit.h self-contained core: make ap_net.h self-contained iputils: make iputils.h self-contained libnetlink: make ipset.h self-contained libnetlink: make libnetlink.h self-contained lua: make luasupp.h self-contained memdebug: make memdebug.h self-contained even when MEMDEBUG isn't defined ppp: make ppp_lcp.h self-contained ppp: make ppp_auth.h self-contained ppp: make ppp_fsm.h self-contained radius: make radius.h self-contained triton: make mempool.h self-contained even when MEMDEBUG is defined vlan_mon: make vlan_mon.h self-contained cli: fix include directive for list.h accel-pppd/backup/backup.h | 5 +++-- accel-pppd/cli/cli.h | 3 ++- accel-pppd/include/ap_net.h | 6 +++++- accel-pppd/include/ap_session_backup.h | 3 +++ accel-pppd/include/connlimit.h | 3 +++ accel-pppd/include/vlan_mon.h | 2 ++ accel-pppd/libnetlink/ipset.h | 2 ++ accel-pppd/libnetlink/iputils.h | 2 ++ accel-pppd/libnetlink/libnetlink.h | 1 + accel-pppd/lua/luasupp.h | 2 ++ accel-pppd/memdebug.h | 5 +++++ accel-pppd/ppp/ppp_auth.h | 3 +++ accel-pppd/ppp/ppp_fsm.h | 7 +++++-- accel-pppd/ppp/ppp_lcp.h | 2 ++ accel-pppd/radius/radius.h | 6 ++++-- accel-pppd/triton/mempool.h | 2 ++ 16 files changed, 46 insertions(+), 8 deletions(-) -- 2.19.1 |
From: Dmitry K. <xe...@ma...> - 2018-10-23 07:27:48
|
>Several modules assume that if ses->ipv6 is set, then >ses->ipv6->addr_list contains at least one element. But this is not >true if ipv6 was allocated by the pseudo ipdb backend of ipv6cp >(ipv6cp_opt_intfid.c). That is, if the PPP session only has an >automatic link-local address. > >This leads modules like pppd-compat and dhcpv6 to access invalid memory >when trying to retrieve the IPv6 address of a PPP session. > >Signed-off-by: Guillaume Nault < g....@al... > >--- >This patch probably fixes github's issue #49 >https://github.com/xebd/accel-ppp/issues/49 > >Also, we could alternatively remove the ipv6cp ipdb backend entirely. >But there would be side effects. Currently, if no IPv6 is defined (by >radius or ipv6db), we can still negotiate a link-local address, then >define a routable IPv6 address statically with pppd_compat. Without the >pseudo ipdb backend, accel-ppp would reject IPv6CP entirely in such >cases. > > accel-pppd/cli/show_sessions.c | 2 +- > accel-pppd/extra/pppd_compat.c | 2 +- > accel-pppd/ipv6/dhcpv6.c | 2 +- > accel-pppd/lua/session.c | 2 +- > 4 files changed, 4 insertions(+), 4 deletions(-) Applied -- Dmitry Kozlov |
From: Dmitry K. <xe...@ma...> - 2018-10-23 06:46:17
|
>Define a new column, called "netns", that prints the network namespace >in which sessions are set. > >Signed-off-by: Guillaume Nault < g....@al... > >--- > accel-pppd/cli/show_sessions.c | 6 ++++++ > 1 file changed, 6 insertions(+) Applied, thanks. -- Dmitry Kozlov |
From: Guillaume N. <g....@al...> - 2018-10-22 10:29:41
|
On Sat, Oct 20, 2018 at 10:55:54AM +0300, Dmitry Kozlov wrote: > >On Tue, Oct 09, 2018 at 07:45:48PM +0200, Guillaume Nault wrote: > >> On Fri, Sep 21, 2018 at 02:16:17PM +0200, Guillaume Nault wrote: > >> > Several modules assume that if ses->ipv6 is set, then > >> > ses->ipv6->addr_list contains at least one element. But this is not > >> > true if ipv6 was allocated by the pseudo ipdb backend of ipv6cp > >> > (ipv6cp_opt_intfid.c). That is, if the PPP session only has an > >> > automatic link-local address. > >> > > >> I had no news for this patch. Is there anything wrong with it? > >> > >BTW, this patch probably fixes github's issue #49: > >https://github.com/xebd/accel-ppp/issues/49 > > > >Guillaume > > didn't get this patch too > Ok, I've just resent it, together with another patch for which I had no news (netns column in "show sessions"). |
From: Guillaume N. <g....@al...> - 2018-10-22 10:25:23
|
Define a new column, called "netns", that prints the network namespace in which sessions are set. Signed-off-by: Guillaume Nault <g....@al...> --- accel-pppd/cli/show_sessions.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/accel-pppd/cli/show_sessions.c b/accel-pppd/cli/show_sessions.c index 443b186a..a96a31d8 100644 --- a/accel-pppd/cli/show_sessions.c +++ b/accel-pppd/cli/show_sessions.c @@ -379,6 +379,11 @@ early_out: goto out; } +static void print_netns(struct ap_session *ses, char *buf) +{ + snprintf(buf, CELL_SIZE, "%s", ses->net->name); +} + static void print_ifname(struct ap_session *ses, char *buf) { snprintf(buf, CELL_SIZE, "%s", ses->ifname); @@ -633,6 +638,7 @@ static void init(void) cli_register_simple_cmd2(show_ses_exec, show_ses_help, 2, "show", "sessions"); + cli_show_ses_register("netns", "network namespace name", print_netns); cli_show_ses_register("ifname", "interface name", print_ifname); cli_show_ses_register("username", "user name", print_username); cli_show_ses_register("ip", "IP address", print_ip); -- 2.19.1 |
From: Guillaume N. <g....@al...> - 2018-10-22 10:24:17
|
Several modules assume that if ses->ipv6 is set, then ses->ipv6->addr_list contains at least one element. But this is not true if ipv6 was allocated by the pseudo ipdb backend of ipv6cp (ipv6cp_opt_intfid.c). That is, if the PPP session only has an automatic link-local address. This leads modules like pppd-compat and dhcpv6 to access invalid memory when trying to retrieve the IPv6 address of a PPP session. Signed-off-by: Guillaume Nault <g....@al...> --- This patch probably fixes github's issue #49 https://github.com/xebd/accel-ppp/issues/49 Also, we could alternatively remove the ipv6cp ipdb backend entirely. But there would be side effects. Currently, if no IPv6 is defined (by radius or ipv6db), we can still negotiate a link-local address, then define a routable IPv6 address statically with pppd_compat. Without the pseudo ipdb backend, accel-ppp would reject IPv6CP entirely in such cases. accel-pppd/cli/show_sessions.c | 2 +- accel-pppd/extra/pppd_compat.c | 2 +- accel-pppd/ipv6/dhcpv6.c | 2 +- accel-pppd/lua/session.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/accel-pppd/cli/show_sessions.c b/accel-pppd/cli/show_sessions.c index 443b186a..c725e05d 100644 --- a/accel-pppd/cli/show_sessions.c +++ b/accel-pppd/cli/show_sessions.c @@ -414,7 +414,7 @@ static void print_ip6(struct ap_session *ses, char *buf) struct in6_addr addr; char *ptr; - if (!ses->ipv6) { + if (!ses->ipv6 || list_empty(&ses->ipv6->addr_list)) { *buf = 0; return; } diff --git a/accel-pppd/extra/pppd_compat.c b/accel-pppd/extra/pppd_compat.c index 318327e8..7c23eca0 100644 --- a/accel-pppd/extra/pppd_compat.c +++ b/accel-pppd/extra/pppd_compat.c @@ -622,7 +622,7 @@ static void fill_env(char **env, char *mem, struct pppd_compat_pd *pd) mem += write_sz + 1; ++n; - if (ses->ipv6) { + if (ses->ipv6 && !list_empty(&ses->ipv6->addr_list)) { ///FIXME only first address is passed to env struct ipv6db_addr_t *a = list_first_entry(&ses->ipv6->addr_list, typeof(*a), entry); diff --git a/accel-pppd/ipv6/dhcpv6.c b/accel-pppd/ipv6/dhcpv6.c index 84d36caa..36a5f15e 100644 --- a/accel-pppd/ipv6/dhcpv6.c +++ b/accel-pppd/ipv6/dhcpv6.c @@ -69,7 +69,7 @@ static void ev_ses_started(struct ap_session *ses) int sock; int f = 1; - if (!ses->ipv6) + if (!ses->ipv6 || list_empty(&ses->ipv6->addr_list)) return; a = list_entry(ses->ipv6->addr_list.next, typeof(*a), entry); diff --git a/accel-pppd/lua/session.c b/accel-pppd/lua/session.c index 277b299f..bd98911b 100644 --- a/accel-pppd/lua/session.c +++ b/accel-pppd/lua/session.c @@ -197,7 +197,7 @@ static int session_ipv6(lua_State *L) if (!ses) return 0; - if (ses->ipv6) { + if (ses->ipv6 && !list_empty(&ses->ipv6->addr_list)) { a = list_entry(ses->ipv6->addr_list.next, typeof(*a), entry); if (a->prefix_len) { build_ip6_addr(a, ses->ipv6->peer_intf_id, &addr); -- 2.19.1 |
From: Guillaume N. <g....@al...> - 2018-10-22 10:13:39
|
On Sat, Oct 20, 2018 at 10:45:30AM +0300, Dmitry Kozlov wrote: > >May I get some feedbacks for this series? We can discuss other solutions, > >but I feel these issues are quite important to resolve. > > > >Regards, > > > >Guillaume > > Hi, > I didn't get these patches, please try to resend > Ok, now resent (with the alternative fix too). Thanks Dmitry. |
From: Guillaume N. <g....@al...> - 2018-10-22 10:09:08
|
On Mon, Oct 22, 2018 at 12:00:00PM +0200, Guillaume Nault wrote: > The problem is that there is no synchronisation between the parent and > its child. When under stress, the child may execute faster than its > parent and the sigchld callback might run triton_context_wakeup() > before the parent had time to call triton_context_schedule(). > > Then accel-ppp might crash because the triton thread might have reset > ctx->thread to NULL, making triton_context_wakeup() write to invalid > memory when trying to insert the context in ctx->thread->wakeup_list[]. > > Synchronising the parent and its child completion's callback would > require cooperation from triton_context_schedule(). Otherwise we would > still have a time frame between the moment we let the callback waking > up the context and the moment we put the context in sleep mode. > Allowing schedule/wakeup call inversion in triton looks simpler since > it avoids modifying the current API. > Here's what this alternative approach would look like: Let triton_context_schedule() take a functionn pointer as parameter and call it just before jumping back to the triton_thread. Then we have a race-free way to unlock the sigchld module in pppd-compat. -------- 8< -------- diff --git a/accel-pppd/extra/pppd_compat.c b/accel-pppd/extra/pppd_compat.c index 318327e8..0d3ac031 100644 --- a/accel-pppd/extra/pppd_compat.c +++ b/accel-pppd/extra/pppd_compat.c @@ -252,9 +252,8 @@ static void ev_ses_pre_up(struct ap_session *ses) sigchld_register_handler(&pd->hnd); if (conf_verbose) log_ppp_info2("pppd_compat: ip-pre-up started (pid %i)\n", pid); - sigchld_unlock(); - triton_context_schedule(); + triton_context_schedule_after(sigchld_unlock); pthread_mutex_lock(&pd->hnd.lock); pthread_mutex_unlock(&pd->hnd.lock); @@ -368,9 +367,8 @@ static void ev_ses_finished(struct ap_session *ses) sigchld_register_handler(&pd->hnd); if (conf_verbose) log_ppp_info2("pppd_compat: ip-down started (pid %i)\n", pid); - sigchld_unlock(); - triton_context_schedule(); + triton_context_schedule_after(sigchld_unlock); pthread_mutex_lock(&pd->hnd.lock); pthread_mutex_unlock(&pd->hnd.lock); diff --git a/accel-pppd/triton/triton.c b/accel-pppd/triton/triton.c index d5bd01d3..2e8fb4ad 100644 --- a/accel-pppd/triton/triton.c +++ b/accel-pppd/triton/triton.c @@ -486,7 +486,7 @@ static ucontext_t * __attribute__((noinline)) alloc_context() return uc; } -void __export triton_context_schedule() +void __export triton_context_schedule_after(void (*func)(void)) { volatile struct _triton_context_t *ctx = (struct _triton_context_t *)this_ctx->tpd; @@ -512,6 +512,8 @@ void __export triton_context_schedule() } else { ctx->thread->ctx = NULL; spin_unlock(&threads_lock); + if (func) + func(); longjmp(jmp_env, 1); } } diff --git a/accel-pppd/triton/triton.h b/accel-pppd/triton/triton.h index 79bbee1b..2b2bf460 100644 --- a/accel-pppd/triton/triton.h +++ b/accel-pppd/triton/triton.h @@ -74,12 +74,17 @@ extern struct triton_stat_t triton_stat; int triton_context_register(struct triton_context_t *, void *arg); void triton_context_unregister(struct triton_context_t *); void triton_context_set_priority(struct triton_context_t *, int); -void triton_context_schedule(void); +void triton_context_schedule_after(void (*func)(void)); void triton_context_wakeup(struct triton_context_t *); int triton_context_call(struct triton_context_t *, void (*func)(void *), void *arg); void triton_cancel_call(struct triton_context_t *, void (*func)(void *)); struct triton_context_t *triton_context_self(void); +static inline void triton_context_schedule(void) +{ + triton_context_schedule_after(NULL); +} + #define MD_MODE_READ 1 #define MD_MODE_WRITE 2 |
From: Guillaume N. <g....@al...> - 2018-10-22 10:00:27
|
When accel-ppp is under stress (for example because of massive disconnections) it may enter a state where no session could be created or destroyed anymore. This happens when at least one of the pppd_compat fork() fail. In this case, the error code path doesn't unlock the sigchld handler, which prevents it from running the completion callbacks of running scripts. If the "fork-limit" option is used, failure to call the completion callback will prevent other scripts from running. This will block setting up and tearing down sessions, as those will wait indefinitely for their pppd_compat scripts to run. Therefore, we have to unlock the sigchld handler when fork() fails. We also need to call fork_queue_wakeup(), because the previous check_fork_limit() call already took one reference in the fork limit. Finally, ev_ses_pre_up() is a bit special because it has to tear the session down if the ip-pre-up script failed. Therefore it also has to call ap_session_terminate() upon fork() failures. Signed-off-by: Guillaume Nault <g....@al...> --- accel-pppd/extra/pppd_compat.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/accel-pppd/extra/pppd_compat.c b/accel-pppd/extra/pppd_compat.c index 318327e8..cf18af4c 100644 --- a/accel-pppd/extra/pppd_compat.c +++ b/accel-pppd/extra/pppd_compat.c @@ -275,8 +275,12 @@ static void ev_ses_pre_up(struct ap_session *ses) log_emerg("pppd_compat: exec '%s': %s\n", conf_ip_pre_up, strerror(errno)); _exit(EXIT_FAILURE); - } else + } else { + sigchld_unlock(); + fork_queue_wakeup(); log_error("pppd_compat: fork: %s\n", strerror(errno)); + ap_session_terminate(ses, TERM_NAS_ERROR, 0); + } } static void ev_ses_started(struct ap_session *ses) @@ -325,8 +329,11 @@ static void ev_ses_started(struct ap_session *ses) log_emerg("pppd_compat: exec '%s': %s\n", conf_ip_up, strerror(errno)); _exit(EXIT_FAILURE); - } else + } else { + sigchld_unlock(); + fork_queue_wakeup(); log_error("pppd_compat: fork: %s\n", strerror(errno)); + } } static void ev_ses_finished(struct ap_session *ses) @@ -385,8 +392,11 @@ static void ev_ses_finished(struct ap_session *ses) log_emerg("pppd_compat: exec '%s': %s\n", conf_ip_down, strerror(errno)); _exit(EXIT_FAILURE); - } else + } else { + sigchld_unlock(); + fork_queue_wakeup(); log_error("pppd_compat: fork: %s\n", strerror(errno)); + } } if (pd->ip_up_hnd.pid) { @@ -474,8 +484,11 @@ static void ev_radius_coa(struct ev_radius_t *ev) log_emerg("pppd_compat: exec '%s': %s\n", conf_ip_change, strerror(errno)); _exit(EXIT_FAILURE); - } else + } else { + sigchld_unlock(); + fork_queue_wakeup(); log_error("pppd_compat: fork: %s\n", strerror(errno)); + } } } -- 2.19.1 |
From: Guillaume N. <g....@al...> - 2018-10-22 10:00:18
|
Allow triton_context_wakeup() to run before triton_context_schedule(). When that happens, triton_context_schedule() now lets the context running instead of putting it in sleep mode. Note that, even though triton now allows triton_context_wakeup() to happen before triton_context_schedule(), these two functions still need to be paired and not nested. That is, in a sequence like the following, triton_context_wakeup() triton_context_wakeup() triton_context_schedule() triton_context_schedule() the second triton_context_schedule() would put the context in sleep mode. No matter how many triton_context_wakeup() have been called, the first triton_context_schedule() "consumes" them all. Being immune to schedule/wakeup inversion allows to fix the pppd_compat module. This module needs to fork() to execute external programs. The parent then waits for completion of its child using triton_context_schedule(). When child terminates, the sigchld module runs a callback that has to call triton_context_wakeup() to resume execution of the parent. The problem is that there is no synchronisation between the parent and its child. When under stress, the child may execute faster than its parent and the sigchld callback might run triton_context_wakeup() before the parent had time to call triton_context_schedule(). Then accel-ppp might crash because the triton thread might have reset ctx->thread to NULL, making triton_context_wakeup() write to invalid memory when trying to insert the context in ctx->thread->wakeup_list[]. Synchronising the parent and its child completion's callback would require cooperation from triton_context_schedule(). Otherwise we would still have a time frame between the moment we let the callback waking up the context and the moment we put the context in sleep mode. Allowing schedule/wakeup call inversion in triton looks simpler since it avoids modifying the current API. Signed-off-by: Guillaume Nault <g....@al...> --- accel-pppd/triton/triton.c | 18 +++++++++++++++--- accel-pppd/triton/triton_p.h | 1 + 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/accel-pppd/triton/triton.c b/accel-pppd/triton/triton.c index d5bd01d3..3ea72f02 100644 --- a/accel-pppd/triton/triton.c +++ b/accel-pppd/triton/triton.c @@ -125,7 +125,7 @@ static void* triton_thread(struct _triton_thread_t *thread) while (1) { spin_lock(&threads_lock); if (!need_config_reload && check_ctx_queue_empty(thread)) { - if (thread->ctx->wakeup) { + if (thread->ctx->asleep && thread->ctx->wakeup) { log_debug2("thread: %p: wakeup ctx %p\n", thread, thread->ctx); list_del(&thread->ctx->entry2); spin_unlock(&threads_lock); @@ -503,6 +503,7 @@ void __export triton_context_schedule() spin_lock(&threads_lock); if (ctx->wakeup) { + ctx->asleep = 0; ctx->wakeup = 0; spin_unlock(&threads_lock); _free(ctx->uc); @@ -510,6 +511,7 @@ void __export triton_context_schedule() __sync_sub_and_fetch(&triton_stat.context_sleeping, 1); log_debug2("ctx %p: exit schedule\n", ctx); } else { + ctx->asleep = 1; ctx->thread->ctx = NULL; spin_unlock(&threads_lock); longjmp(jmp_env, 1); @@ -532,9 +534,19 @@ void __export triton_context_wakeup(struct triton_context_t *ud) spin_unlock(&ctx->lock); } else { spin_lock(&threads_lock); + /* In some cases (pppd_compat.c), triton_context_wakeup() might + * be called before triton_context_schedule(). When that + * happens, we must not add 'ctx' to the wakeup_list as it is + * still awake. However we need to set the 'wakeup' flag. This + * way, when triton_context_schedule() will run, it will + * realise that triton_context_wakeup() was already executed + * and will avoid putting 'ctx' in sleep mode. + */ ctx->wakeup = 1; - list_add_tail(&ctx->entry2, &ctx->thread->wakeup_list[ctx->priority]); - r = ctx->thread->ctx == NULL; + if (ctx->asleep) { + list_add_tail(&ctx->entry2, &ctx->thread->wakeup_list[ctx->priority]); + r = ctx->thread->ctx == NULL; + } spin_unlock(&threads_lock); } diff --git a/accel-pppd/triton/triton_p.h b/accel-pppd/triton/triton_p.h index fe735bf6..a727e62e 100644 --- a/accel-pppd/triton/triton_p.h +++ b/accel-pppd/triton/triton_p.h @@ -40,6 +40,7 @@ struct _triton_context_t int init; int queued; int wakeup; + int asleep; int need_close; int need_free; int pending; -- 2.19.1 |
From: Guillaume N. <g....@al...> - 2018-10-22 10:00:15
|
This series fixes two problems that can happen when many pppd_compat scripts need to run at the same moment. Patch 1 is a bit tricky. If fixes a race condition that happens inside triton when triton_context_wakeup() is run before the corresponding triton_context_schedule(). As explained in the patch, pppd_compat has no way to ensure proper ordering of these functions with the current triton API. So this patch modifies triton to explicitly handle the case where triton_context_wakeup() is called before triton_context_schedule(). An alternative fix would be to extend triton_context_schedule() so that it could run an arbitrary function and yield atomically. I will post this alternative fix too, so that one can compare both approaches. Patch 2 fixes a deadlock that happens when fork() fails. The failed attempt was not removed from the fork counter, preventing other scripts from being executed when the "fork-limit" option was set. Guillaume Nault (2): triton: fix context schedule/wakeup race pppd_compat: fix handling of fork() failures accel-pppd/extra/pppd_compat.c | 21 +++++++++++++++++---- accel-pppd/triton/triton.c | 18 +++++++++++++++--- accel-pppd/triton/triton_p.h | 1 + 3 files changed, 33 insertions(+), 7 deletions(-) -- 2.19.1 |
From: Dmitry K. <xe...@ma...> - 2018-10-20 07:56:07
|
>On Tue, Oct 09, 2018 at 07:45:48PM +0200, Guillaume Nault wrote: >> On Fri, Sep 21, 2018 at 02:16:17PM +0200, Guillaume Nault wrote: >> > Several modules assume that if ses->ipv6 is set, then >> > ses->ipv6->addr_list contains at least one element. But this is not >> > true if ipv6 was allocated by the pseudo ipdb backend of ipv6cp >> > (ipv6cp_opt_intfid.c). That is, if the PPP session only has an >> > automatic link-local address. >> > >> I had no news for this patch. Is there anything wrong with it? >> >BTW, this patch probably fixes github's issue #49: >https://github.com/xebd/accel-ppp/issues/49 > >Guillaume didn't get this patch too -- Dmitry Kozlov |
From: Dmitry K. <xe...@ma...> - 2018-10-20 07:45:41
|
>May I get some feedbacks for this series? We can discuss other solutions, >but I feel these issues are quite important to resolve. > >Regards, > >Guillaume Hi, I didn't get these patches, please try to resend -- Dmitry Kozlov |
From: Guillaume N. <g....@al...> - 2018-10-19 09:49:53
|
On Tue, Oct 09, 2018 at 07:45:48PM +0200, Guillaume Nault wrote: > On Fri, Sep 21, 2018 at 02:16:17PM +0200, Guillaume Nault wrote: > > Several modules assume that if ses->ipv6 is set, then > > ses->ipv6->addr_list contains at least one element. But this is not > > true if ipv6 was allocated by the pseudo ipdb backend of ipv6cp > > (ipv6cp_opt_intfid.c). That is, if the PPP session only has an > > automatic link-local address. > > > I had no news for this patch. Is there anything wrong with it? > BTW, this patch probably fixes github's issue #49: https://github.com/xebd/accel-ppp/issues/49 Guillaume |
From: Guillaume N. <g....@al...> - 2018-10-19 09:46:48
|
On Tue, Oct 09, 2018 at 06:58:29PM +0200, Guillaume Nault wrote: > This series fixes two problems that can happen when many pppd_compat > scripts need to run at the same moment. > > > Patch 1 is a bit tricky. If fixes a race condition that happens inside > triton when triton_context_wakeup() is run before the corresponding > triton_context_schedule(). As explained in the patch, pppd_compat has > no way to ensure proper ordering of these functions with the current > triton API. So this patch modifies triton to explicitly handle the > case where triton_context_wakeup() is called before > triton_context_schedule(). > > An alternative fix would be to extend triton_context_schedule() so that > it can run an arbitrary function and yield atomically. I will post this > alternative fix too, so that one can compare both approaches. > > > Patch 2 fixes a deadlock that happens when fork() fails. The failed > attempt was not removed from the fork counter, preventing other scripts > from being executed when the "fork-limit" option was set. > Hi, May I get some feedbacks for this series? We can discuss other solutions, but I feel these issues are quite important to resolve. Regards, Guillaume |
From: Guillaume N. <g....@al...> - 2018-10-18 10:18:12
|
On Sun, Oct 14, 2018 at 10:18:42PM +0200, Alarig Le Lay wrote: > On ven. 12 oct. 17:21:01 2018, Guillaume Nault wrote: > > There are a few places where the bug might come from. Either the host > > doesn't tell the driver that checksum isn't fully computed, or the > > driver advertises checksum offload capabilities without actually > > implementing it. > > If you use virtualisation, the virtual NIC might advertise offload > > support and rely on the physical NIC driver to actually perform the > > computation. If the physical NIC doesn't offer this feature, the > > virtual NIC should provide software fallback. > > I suppose there is a bug somewhere in this chain. > > > > So I imagine something changed in your setup wrt. checksum offload, > > either on the LNS or the hypervisor. > > The checksums seem to be enabled on both LNS: > > judicael-adsl ~ # ethtool -k eth0 | grep tx-checksum > tx-checksumming: on > tx-checksum-ipv4: off [fixed] > tx-checksum-ip-generic: on > tx-checksum-ipv6: off [fixed] > tx-checksum-fcoe-crc: off [fixed] > tx-checksum-sctp: off [fixed] > > lns02 ~ # ethtool -k eth2 | grep tx-checksum > tx-checksumming: on > tx-checksum-ipv4: off [fixed] > tx-checksum-ip-generic: on > tx-checksum-ipv6: off [fixed] > tx-checksum-fcoe-crc: off [fixed] > tx-checksum-sctp: off [fixed] > > But, as soon as I re-enable checksumming on the BGP router, I see the > same kind of messages in the logs. > > root@bgp-adsl:~ # ifconfig vtnet2 rxcsum txcsum tso lro > > lns02 ~ # tail -F /var/log/daemon.log > […] > Oct 14 22:00:42 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmission #1 > Oct 14 22:00:42 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmit (timeout) [L2TP tid=46302 sid=0 <Message-Type Hello>] > Oct 14 22:00:44 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmission #2 > Oct 14 22:00:44 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmit (timeout) [L2TP tid=46302 sid=0 <Message-Type Hello>] > Oct 14 22:00:48 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmission #3 > Oct 14 22:00:48 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmit (timeout) [L2TP tid=46302 sid=0 <Message-Type Hello>] > Oct 14 22:00:56 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmission #4 > Oct 14 22:00:56 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmit (timeout) [L2TP tid=46302 sid=0 <Message-Type Hello>] > Oct 14 22:01:12 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmission #5 > Oct 14 22:01:12 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmit (timeout) [L2TP tid=46302 sid=0 <Message-Type Hello>] > Oct 14 22:01:28 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): no acknowledgement from peer after 5 retransmissions, deleting tunnel > Oct 14 22:01:28 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): deleting tunnel > Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 61625-32197: deleting session > Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 61625-32197: deleting data channel > Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 45186-40850: deleting session > Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 45186-40850: deleting data channel > Oct 14 22:01:28 lns02 accel-pppd: ppp0:qu...@gr...el: send [RADIUS(1) Accounting-Request id=1 <User-Name "qu...@gr...el"> <NAS-Identifier "accel-ppp"> <NAS-IP-Address 89.234.186.205> <NAS-Port 0> <NAS-Port-Id "ppp0"> <NAS-Port-Type Virtual> <Service-Type Framed-User> <Framed-Protocol PPP> <Calling-Station-Id "78.41.184.80"> <Called-Station-Id "89.234.186.12"> <Acct-Status-Type Stop> <Acct-Authentic RADIUS> <Acct-Session-Id "0dbb447ec6fa5cbf"> <Acct-Session-Time 73568> <Acct-Input-Octets 365708680> <Acct-Output-Octets 1036838156> <Acct-Input-Packets 2436312> <Acct-Output-Packets 4425180> <Acct-Input-Gigawords 0> <Acct-Output-Gigawords 1> <Framed-IP-Address 89.234.186.38> <Framed-Interface-Id 0:0:0:2> <Framed-IPv6-Prefix 2a00:5884:1100:3::/64> <Acct-Terminate-Cause NAS-Request>] > Oct 14 22:01:28 lns02 accel-pppd: ppp0:qu...@gr...el: pppd_compat: ip-down started (pid 19050) > Oct 14 22:01:28 lns02 accel-pppd: ppp1:un...@gr...el: send [RADIUS(1) Accounting-Request id=1 <User-Name "un...@gr...el"> <NAS-Identifier "accel-ppp"> <NAS-IP-Address 89.234.186.205> <NAS-Port 1> <NAS-Port-Id "ppp1"> <NAS-Port-Type Virtual> <Service-Type Framed-User> <Framed-Protocol PPP> <Calling-Station-Id "78.41.184.80"> <Called-Station-Id "89.234.186.12"> <Acct-Status-Type Stop> <Acct-Authentic RADIUS> <Acct-Session-Id "0dbb447ec6fa4c93"> <Acct-Session-Time 153647> <Acct-Input-Octets 391223201> <Acct-Output-Octets 3536240375> <Acct-Input-Packets 4975569> <Acct-Output-Packets 6891147> <Acct-Input-Gigawords 0> <Acct-Output-Gigawords 1> <Framed-IP-Address 89.234.186.39> <Framed-Interface-Id 6:6b84:6:6b7d> <Framed-IPv6-Prefix 2a00:5884:1100:7::/64> <Acct-Terminate-Cause NAS-Request>] > Oct 14 22:01:28 lns02 accel-pppd: ppp1:un...@gr...el: pppd_compat: ip-down started (pid 19051) > Oct 14 22:01:28 lns02 accel-pppd: ppp0:qu...@gr...el: pppd_compat: ip-down finished (0) > Oct 14 22:01:28 lns02 accel-pppd: ppp0:: session destroyed > Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 45186-40850: session destroyed > Oct 14 22:01:28 lns02 accel-pppd: ppp1:un...@gr...el: pppd_compat: ip-down finished (0) > Oct 14 22:01:28 lns02 accel-pppd: ppp1:: session destroyed > Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 61625-32197: session destroyed > Oct 14 22:01:28 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): tunnel destroyed > Oct 14 22:01:28 lns02 bird: KRT: Received route 2a00:5884:1107::/48 with strange next-hop fe80::6:6b84:6:6b7d > Oct 14 22:01:28 lns02 accel-pppd: recv [RADIUS(1) Accounting-Response id=1] > Oct 14 22:01:28 lns02 accel-pppd: recv [RADIUS(1) Accounting-Response id=1] > Oct 14 22:01:41 lns02 accel-pppd: l2tp: discarding unexpected message from 78.41.184.80: invalid tid 347 > Oct 14 22:01:42 lns02 accel-pppd: l2tp: discarding unexpected message from 78.41.184.80: invalid tid 347 > > So, the two ADSL lines on this LNS at this moment went down, and the > same kind of message appears (unless the tunnel number was already > negotiated) > Makes sense. > The two hypervisors have the same kind of network cards > 01:00.0 Ethernet controller: Broadcom Limited NetXtreme II BCM5709 Gigabit Ethernet (rev 20) > or > 02:00.0 Ethernet controller: Broadcom Limited NetXtreme II BCM5716 Gigabit Ethernet (rev 20) > Do these physical interfaces have checksum offload activated? > So for me, the problem is on the checksums, but is located on the FreeBSD IP stack (again…). > > What I still don’t understand, is why xl2tpd isn’t affected, but I > think it’s not the right place to discuss about it ;) > Skimming through the xl2tp code, it seems that it disables UDP checksums. That could be a workaround, but I'd be better find the root issue. |
From: Alarig Le L. <al...@sw...> - 2018-10-14 20:19:01
|
On ven. 12 oct. 17:21:01 2018, Guillaume Nault wrote: > There are a few places where the bug might come from. Either the host > doesn't tell the driver that checksum isn't fully computed, or the > driver advertises checksum offload capabilities without actually > implementing it. > If you use virtualisation, the virtual NIC might advertise offload > support and rely on the physical NIC driver to actually perform the > computation. If the physical NIC doesn't offer this feature, the > virtual NIC should provide software fallback. > I suppose there is a bug somewhere in this chain. > > So I imagine something changed in your setup wrt. checksum offload, > either on the LNS or the hypervisor. The checksums seem to be enabled on both LNS: judicael-adsl ~ # ethtool -k eth0 | grep tx-checksum tx-checksumming: on tx-checksum-ipv4: off [fixed] tx-checksum-ip-generic: on tx-checksum-ipv6: off [fixed] tx-checksum-fcoe-crc: off [fixed] tx-checksum-sctp: off [fixed] lns02 ~ # ethtool -k eth2 | grep tx-checksum tx-checksumming: on tx-checksum-ipv4: off [fixed] tx-checksum-ip-generic: on tx-checksum-ipv6: off [fixed] tx-checksum-fcoe-crc: off [fixed] tx-checksum-sctp: off [fixed] But, as soon as I re-enable checksumming on the BGP router, I see the same kind of messages in the logs. root@bgp-adsl:~ # ifconfig vtnet2 rxcsum txcsum tso lro lns02 ~ # tail -F /var/log/daemon.log […] Oct 14 22:00:42 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmission #1 Oct 14 22:00:42 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmit (timeout) [L2TP tid=46302 sid=0 <Message-Type Hello>] Oct 14 22:00:44 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmission #2 Oct 14 22:00:44 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmit (timeout) [L2TP tid=46302 sid=0 <Message-Type Hello>] Oct 14 22:00:48 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmission #3 Oct 14 22:00:48 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmit (timeout) [L2TP tid=46302 sid=0 <Message-Type Hello>] Oct 14 22:00:56 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmission #4 Oct 14 22:00:56 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmit (timeout) [L2TP tid=46302 sid=0 <Message-Type Hello>] Oct 14 22:01:12 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmission #5 Oct 14 22:01:12 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): retransmit (timeout) [L2TP tid=46302 sid=0 <Message-Type Hello>] Oct 14 22:01:28 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): no acknowledgement from peer after 5 retransmissions, deleting tunnel Oct 14 22:01:28 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): deleting tunnel Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 61625-32197: deleting session Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 61625-32197: deleting data channel Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 45186-40850: deleting session Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 45186-40850: deleting data channel Oct 14 22:01:28 lns02 accel-pppd: ppp0:qu...@gr...el: send [RADIUS(1) Accounting-Request id=1 <User-Name "qu...@gr...el"> <NAS-Identifier "accel-ppp"> <NAS-IP-Address 89.234.186.205> <NAS-Port 0> <NAS-Port-Id "ppp0"> <NAS-Port-Type Virtual> <Service-Type Framed-User> <Framed-Protocol PPP> <Calling-Station-Id "78.41.184.80"> <Called-Station-Id "89.234.186.12"> <Acct-Status-Type Stop> <Acct-Authentic RADIUS> <Acct-Session-Id "0dbb447ec6fa5cbf"> <Acct-Session-Time 73568> <Acct-Input-Octets 365708680> <Acct-Output-Octets 1036838156> <Acct-Input-Packets 2436312> <Acct-Output-Packets 4425180> <Acct-Input-Gigawords 0> <Acct-Output-Gigawords 1> <Framed-IP-Address 89.234.186.38> <Framed-Interface-Id 0:0:0:2> <Framed-IPv6-Prefix 2a00:5884:1100:3::/64> <Acct-Terminate-Cause NAS-Request>] Oct 14 22:01:28 lns02 accel-pppd: ppp0:qu...@gr...el: pppd_compat: ip-down started (pid 19050) Oct 14 22:01:28 lns02 accel-pppd: ppp1:un...@gr...el: send [RADIUS(1) Accounting-Request id=1 <User-Name "un...@gr...el"> <NAS-Identifier "accel-ppp"> <NAS-IP-Address 89.234.186.205> <NAS-Port 1> <NAS-Port-Id "ppp1"> <NAS-Port-Type Virtual> <Service-Type Framed-User> <Framed-Protocol PPP> <Calling-Station-Id "78.41.184.80"> <Called-Station-Id "89.234.186.12"> <Acct-Status-Type Stop> <Acct-Authentic RADIUS> <Acct-Session-Id "0dbb447ec6fa4c93"> <Acct-Session-Time 153647> <Acct-Input-Octets 391223201> <Acct-Output-Octets 3536240375> <Acct-Input-Packets 4975569> <Acct-Output-Packets 6891147> <Acct-Input-Gigawords 0> <Acct-Output-Gigawords 1> <Framed-IP-Address 89.234.186.39> <Framed-Interface-Id 6:6b84:6:6b7d> <Framed-IPv6-Prefix 2a00:5884:1100:7::/64> <Acct-Terminate-Cause NAS-Request>] Oct 14 22:01:28 lns02 accel-pppd: ppp1:un...@gr...el: pppd_compat: ip-down started (pid 19051) Oct 14 22:01:28 lns02 accel-pppd: ppp0:qu...@gr...el: pppd_compat: ip-down finished (0) Oct 14 22:01:28 lns02 accel-pppd: ppp0:: session destroyed Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 45186-40850: session destroyed Oct 14 22:01:28 lns02 accel-pppd: ppp1:un...@gr...el: pppd_compat: ip-down finished (0) Oct 14 22:01:28 lns02 accel-pppd: ppp1:: session destroyed Oct 14 22:01:28 lns02 accel-pppd: l2tp session 347-46302, 61625-32197: session destroyed Oct 14 22:01:28 lns02 accel-pppd: l2tp tunnel 347-46302 (78.41.184.80:1701): tunnel destroyed Oct 14 22:01:28 lns02 bird: KRT: Received route 2a00:5884:1107::/48 with strange next-hop fe80::6:6b84:6:6b7d Oct 14 22:01:28 lns02 accel-pppd: recv [RADIUS(1) Accounting-Response id=1] Oct 14 22:01:28 lns02 accel-pppd: recv [RADIUS(1) Accounting-Response id=1] Oct 14 22:01:41 lns02 accel-pppd: l2tp: discarding unexpected message from 78.41.184.80: invalid tid 347 Oct 14 22:01:42 lns02 accel-pppd: l2tp: discarding unexpected message from 78.41.184.80: invalid tid 347 So, the two ADSL lines on this LNS at this moment went down, and the same kind of message appears (unless the tunnel number was already negotiated) The two hypervisors have the same kind of network cards 01:00.0 Ethernet controller: Broadcom Limited NetXtreme II BCM5709 Gigabit Ethernet (rev 20) or 02:00.0 Ethernet controller: Broadcom Limited NetXtreme II BCM5716 Gigabit Ethernet (rev 20) So for me, the problem is on the checksums, but is located on the FreeBSD IP stack (again…). What I still don’t understand, is why xl2tpd isn’t affected, but I think it’s not the right place to discuss about it ;) Cheers, -- Alarig |
From: Guillaume N. <g....@al...> - 2018-10-12 15:21:14
|
On Sat, Oct 06, 2018 at 10:33:18AM +0200, Alarig Le Lay wrote: > On ven. 5 oct. 17:41:12 2018, Guillaume Nault wrote: > > Which kernel version is your Gentoo? Could you send a pcap of a > > successful tunnel establishment (one from the Gentoo and one from the > > FreeBSD, as you did in your previous message)? > > My Gentoo runs 4.14.65. > The offload on it is currently configured as following: > lns02 ~ # ethtool -k eth2 | grep offload > tcp-segmentation-offload: on > udp-fragmentation-offload: off > generic-segmentation-offload: on > generic-receive-offload: on > large-receive-offload: off [fixed] > rx-vlan-offload: off [fixed] > tx-vlan-offload: off [fixed] > l2-fwd-offload: off [fixed] > hw-tc-offload: off [fixed] > esp-hw-offload: off [fixed] > esp-tx-csum-hw-offload: off [fixed] > rx-udp_tunnel-port-offload: off [fixed] > > The pcap for the Gentoo (LNS) and the FreeBSD (BGP) are here: > https://bulbizarre.swordarmor.fr/garbage/documents/l2tp-bgp-up.pcap > https://bulbizarre.swordarmor.fr/garbage/documents/l2tp-lns-up.pcap > Ok, so I have taken a look at these network captures. Sorry for the delay. In the pcap taken on the LNS, the outer UDP checksum is partial. That is, it is filled with the pseudo-header's sum only, instead of the full checksum covering the whole data. That means you have checksum offload activated on this host. The driver is supposed to complete the checksum later, before it sends the packet on the wire. Your first LNS network trace exhibited the same behaviour (which is perfectly normal as tcpdump captures packets before the driver computes the checksum). So nothing changed on this side. However, the pcap files taken on the router differ. On the new pcap, the checksum seen on the router is correct. That is, the partial checksum has been properly completed before the packet was sent. But on the original pcap, the router did see the same partial checksum as seen on the LNS capture. So the driver sent it as-is, without finalising the computation. The problem is probably not related to accel-ppp or l2tp at all. For sending data on the control channel, a plain UDP socket is used. Therefore I believe that any UDP (and probably TCP) packet sent by this host in the original setup would have invalid checksum. There are a few places where the bug might come from. Either the host doesn't tell the driver that checksum isn't fully computed, or the driver advertises checksum offload capabilities without actually implementing it. If you use virtualisation, the virtual NIC might advertise offload support and rely on the physical NIC driver to actually perform the computation. If the physical NIC doesn't offer this feature, the virtual NIC should provide software fallback. I suppose there is a bug somewhere in this chain. So I imagine something changed in your setup wrt. checksum offload, either on the LNS or the hypervisor. > > I guess something is still wrong. It would be strange if it was the > > router that was at fault. > > My guess is that, by disabling offloads, the router stopped checking > > the UDP checksum of transit packets. If the LAC does not verify UDP > > checksums, then invalid checksums don't prevent the LAC and LNS from > > communicating. If you capture trafic on the router, do you still see > > invalid checksums for packets originated by the LNS? And correct > > checksums for packets sent by the other LNS? > > On the other LNS, I see [no chksum] for the L2TP packets, but [udp sum > ok] for the packets inside the L2TP. >From what I can see from the pcap, your provider disables checksums of L2TP data packets (outer header). This is common behaviour. Original packets are sent as-is, so the inner headers and checksums aren't modified. Guillaume |
From: Guillaume N. <g....@al...> - 2018-10-12 11:10:27
|
Define a new column, called "netns", that prints the network namespace in which sessions are set. Signed-off-by: Guillaume Nault <g....@al...> --- accel-pppd/cli/show_sessions.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/accel-pppd/cli/show_sessions.c b/accel-pppd/cli/show_sessions.c index 443b186a..a96a31d8 100644 --- a/accel-pppd/cli/show_sessions.c +++ b/accel-pppd/cli/show_sessions.c @@ -379,6 +379,11 @@ early_out: goto out; } +static void print_netns(struct ap_session *ses, char *buf) +{ + snprintf(buf, CELL_SIZE, "%s", ses->net->name); +} + static void print_ifname(struct ap_session *ses, char *buf) { snprintf(buf, CELL_SIZE, "%s", ses->ifname); @@ -633,6 +638,7 @@ static void init(void) cli_register_simple_cmd2(show_ses_exec, show_ses_help, 2, "show", "sessions"); + cli_show_ses_register("netns", "network namespace name", print_netns); cli_show_ses_register("ifname", "interface name", print_ifname); cli_show_ses_register("username", "user name", print_username); cli_show_ses_register("ip", "IP address", print_ip); -- 2.19.1 |
From: Guillaume N. <g....@al...> - 2018-10-09 17:45:57
|
On Fri, Sep 21, 2018 at 02:16:17PM +0200, Guillaume Nault wrote: > Several modules assume that if ses->ipv6 is set, then > ses->ipv6->addr_list contains at least one element. But this is not > true if ipv6 was allocated by the pseudo ipdb backend of ipv6cp > (ipv6cp_opt_intfid.c). That is, if the PPP session only has an > automatic link-local address. > I had no news for this patch. Is there anything wrong with it? |
From: Guillaume N. <g....@al...> - 2018-10-09 17:14:18
|
On Tue, Oct 09, 2018 at 06:58:31PM +0200, Guillaume Nault wrote: > The problem is that there is no synchronisation between the parent and > its child. When under stress, the child may execute faster than its > parent and the sigchld callback might run triton_context_wakeup() > before the parent had time to call triton_context_schedule(). > As an alternative fix, we could let triton_context_schedule() run a caller-provided function right before jumping back to the triton_thread. This way, the parent could unlock the sigchld module and yield in a race-free way. Then the child couldn't possibly run between both operations. Here's how it would look like. As per my tests both approaches work equally well. If the triton_context_schedule_after() approach is preferred, then I can re-submit the whole series. -------- 8< -------- diff --git a/accel-pppd/extra/pppd_compat.c b/accel-pppd/extra/pppd_compat.c index 318327e8..0d3ac031 100644 --- a/accel-pppd/extra/pppd_compat.c +++ b/accel-pppd/extra/pppd_compat.c @@ -252,9 +252,8 @@ static void ev_ses_pre_up(struct ap_session *ses) sigchld_register_handler(&pd->hnd); if (conf_verbose) log_ppp_info2("pppd_compat: ip-pre-up started (pid %i)\n", pid); - sigchld_unlock(); - triton_context_schedule(); + triton_context_schedule_after(sigchld_unlock); pthread_mutex_lock(&pd->hnd.lock); pthread_mutex_unlock(&pd->hnd.lock); @@ -368,9 +367,8 @@ static void ev_ses_finished(struct ap_session *ses) sigchld_register_handler(&pd->hnd); if (conf_verbose) log_ppp_info2("pppd_compat: ip-down started (pid %i)\n", pid); - sigchld_unlock(); - triton_context_schedule(); + triton_context_schedule_after(sigchld_unlock); pthread_mutex_lock(&pd->hnd.lock); pthread_mutex_unlock(&pd->hnd.lock); diff --git a/accel-pppd/triton/triton.c b/accel-pppd/triton/triton.c index d5bd01d3..2e8fb4ad 100644 --- a/accel-pppd/triton/triton.c +++ b/accel-pppd/triton/triton.c @@ -486,7 +486,7 @@ static ucontext_t * __attribute__((noinline)) alloc_context() return uc; } -void __export triton_context_schedule() +void __export triton_context_schedule_after(void (*func)(void)) { volatile struct _triton_context_t *ctx = (struct _triton_context_t *)this_ctx->tpd; @@ -512,6 +512,8 @@ void __export triton_context_schedule() } else { ctx->thread->ctx = NULL; spin_unlock(&threads_lock); + if (func) + func(); longjmp(jmp_env, 1); } } diff --git a/accel-pppd/triton/triton.h b/accel-pppd/triton/triton.h index 79bbee1b..2b2bf460 100644 --- a/accel-pppd/triton/triton.h +++ b/accel-pppd/triton/triton.h @@ -74,12 +74,17 @@ extern struct triton_stat_t triton_stat; int triton_context_register(struct triton_context_t *, void *arg); void triton_context_unregister(struct triton_context_t *); void triton_context_set_priority(struct triton_context_t *, int); -void triton_context_schedule(void); +void triton_context_schedule_after(void (*func)(void)); void triton_context_wakeup(struct triton_context_t *); int triton_context_call(struct triton_context_t *, void (*func)(void *), void *arg); void triton_cancel_call(struct triton_context_t *, void (*func)(void *)); struct triton_context_t *triton_context_self(void); +static inline void triton_context_schedule(void) +{ + triton_context_schedule_after(NULL); +} + #define MD_MODE_READ 1 #define MD_MODE_WRITE 2 |