#792 dns cache memory leak and TTL failure after failed conn

closed-fixed
libcurl (354)
5
2013-06-21
2008-12-10
Phil Lisiecki
No

After a failed connection attempt, the DNS cache entry appears never to be unlocked. This results in a memory leak and an ever-increasing inuse count. And, since the cache entry can only ever be pruned when both the TTL (DNS_CACHE_TIMEOUT) is expired and inuse=0, the hostname will never be reresolved for the lifetime of that DNS cache.

I have attached a simple program that demonstrates the problem. It takes a URL to download on the command line. Make sure the hostname resolves to an IP that just times out. If you change the hostname's resolution, libcurl continues trying to connect to the original IP forever (as observed by netstat, for example). Also, adding debug print to url.c shows the increasing "inuse" count.

My patch to solve this issue is included below, though you may prefer to solve the problem a different way. (I have only lightly tested the patch at this point.)

--- curl-7.19.2.orig/lib/url.c Tue Dec 9 23:10:34 2008
+++ curl-7.19.2/lib/url.c Wed Dec 10 03:16:48 2008
@@ -2699,6 +2699,7 @@ ConnectionStore(struct SessionHandle *da
static CURLcode ConnectPlease(struct SessionHandle *data,
struct connectdata *conn,
struct Curl_dns_entry *hostaddr,
+ bool *hostaddr_used,
bool *connected)
{
CURLcode result;
@@ -2721,6 +2722,12 @@ static CURLcode ConnectPlease(struct Ses
connected);
if(CURLE_OK == result) {
/* All is cool, then we store the current information */
+ if (conn->dns_entry != hostaddr) {
+ if (conn->dns_entry && !*hostaddr_used)
+ Curl_resolv_unlock(data, conn->dns_entry);
+ if (hostaddr)
+ *hostaddr_used = TRUE;
+ }
conn->dns_entry = hostaddr;
conn->ip_addr = addr;

@@ -4354,6 +4361,7 @@ static CURLcode create_conn(struct Sessi

static CURLcode setup_conn(struct connectdata *conn,
struct Curl_dns_entry *hostaddr,
+ bool *hostaddr_used,
bool *protocol_done)
{
CURLcode result=CURLE_OK;
@@ -4406,7 +4414,8 @@ static CURLcode setup_conn(struct connec
* only for the case where we re-use an existing connection and thus
* this code section will not be reached with hostaddr == NULL.
*/
- result = ConnectPlease(data, conn, hostaddr, &connected);
+ result = ConnectPlease(data, conn, hostaddr, hostaddr_used, &connected);
+ free_hostaddr = (result != CURLE_OK);

if(connected) {
result = Curl_protocol_connect(conn, protocol_done);
@@ -4468,6 +4477,7 @@ CURLcode Curl_connect(struct SessionHand
{
CURLcode code;
struct Curl_dns_entry *dns;
+ bool dns_used = FALSE;

*asyncp = FALSE; /* assume synchronous resolves by default */

@@ -4485,12 +4495,16 @@ CURLcode Curl_connect(struct SessionHand
/* If an address is available it means that we already have the name
resolved, OR it isn't async. if this is a re-used connection 'dns'
will be NULL here. Continue connecting from here */
- code = setup_conn(*in_connect, dns, protocol_done);
+ code = setup_conn(*in_connect, dns, &dns_used, protocol_done);
/* else
response will be received and treated async wise */
}
}

+ /* Free dns entry if it was not used */
+ if (dns && !dns_used)
+ Curl_resolv_unlock(data, dns);
+
if(CURLE_OK != code && *in_connect) {
/* We're not allowed to return failure with memory left allocated
in the connectdata struct, free those here */
@@ -4511,7 +4525,11 @@ CURLcode Curl_async_resolved(struct conn
{
#if defined(USE_ARES) || defined(USE_THREADING_GETHOSTBYNAME) || \ defined(USE_THREADING_GETADDRINFO)
- CURLcode code = setup_conn(conn, conn->async.dns, protocol_done);
+ bool dns_used = FALSE;
+ CURLcode code = setup_conn(conn, conn->async.dns, &dns_used, protocol_done);
+
+ if (conn->async.dns && !dns_used)
+ Curl_resolv_unlock(data, conn->async.dns);

if(code)
/* We're not allowed to return failure with memory left allocated

Discussion

1 2 3 > >> (Page 1 of 3)
  • Phil Lisiecki
    Phil Lisiecki
    2008-12-10

    Simple example program

     
    Attachments
  • Phil Lisiecki
    Phil Lisiecki
    2008-12-10

    Revised patch (one line from a previous solution attempt slipped into the previous patch accidentally):

    --- curl-7.19.2.orig/lib/url.c Tue Dec 9 23:10:34 2008
    +++ curl-7.19.2/lib/url.c Wed Dec 10 03:16:48 2008
    @@ -2699,6 +2699,7 @@ ConnectionStore(struct SessionHandle *da
    static CURLcode ConnectPlease(struct SessionHandle *data,
    struct connectdata *conn,
    struct Curl_dns_entry *hostaddr,
    + bool *hostaddr_used,
    bool *connected)
    {
    CURLcode result;
    @@ -2721,6 +2722,12 @@ static CURLcode ConnectPlease(struct Ses
    connected);
    if(CURLE_OK == result) {
    /* All is cool, then we store the current information */
    + if (conn->dns_entry != hostaddr) {
    + if (conn->dns_entry && !*hostaddr_used)
    + Curl_resolv_unlock(data, conn->dns_entry);
    + if (hostaddr)
    + *hostaddr_used = TRUE;
    + }
    conn->dns_entry = hostaddr;
    conn->ip_addr = addr;

    @@ -4354,6 +4361,7 @@ static CURLcode create_conn(struct Sessi

    static CURLcode setup_conn(struct connectdata *conn,
    struct Curl_dns_entry *hostaddr,
    + bool *hostaddr_used,
    bool *protocol_done)
    {
    CURLcode result=CURLE_OK;
    @@ -4406,7 +4414,7 @@ static CURLcode setup_conn(struct connec
    * only for the case where we re-use an existing connection and thus
    * this code section will not be reached with hostaddr == NULL.
    */
    - result = ConnectPlease(data, conn, hostaddr, &connected);
    + result = ConnectPlease(data, conn, hostaddr, hostaddr_used, &connected);

    if(connected) {
    result = Curl_protocol_connect(conn, protocol_done);
    @@ -4468,6 +4476,7 @@ CURLcode Curl_connect(struct SessionHand
    {
    CURLcode code;
    struct Curl_dns_entry *dns;
    + bool dns_used = FALSE;

    *asyncp = FALSE; /* assume synchronous resolves by default */

    @@ -4485,12 +4494,16 @@ CURLcode Curl_connect(struct SessionHand
    /* If an address is available it means that we already have the name
    resolved, OR it isn't async. if this is a re-used connection 'dns'
    will be NULL here. Continue connecting from here */
    - code = setup_conn(*in_connect, dns, protocol_done);
    + code = setup_conn(*in_connect, dns, &dns_used, protocol_done);
    /* else
    response will be received and treated async wise */
    }
    }

    + /* Free dns entry if it was not used */
    + if (dns && !dns_used)
    + Curl_resolv_unlock(data, dns);
    +
    if(CURLE_OK != code && *in_connect) {
    /* We're not allowed to return failure with memory left allocated
    in the connectdata struct, free those here */
    @@ -4511,7 +4524,11 @@ CURLcode Curl_async_resolved(struct conn
    {
    #if defined(USE_ARES) || defined(USE_THREADING_GETHOSTBYNAME) || \ defined(USE_THREADING_GETADDRINFO)
    - CURLcode code = setup_conn(conn, conn->async.dns, protocol_done);
    + bool dns_used = FALSE;
    + CURLcode code = setup_conn(conn, conn->async.dns, &dns_used, protocol_done);
    +
    + if (conn->async.dns && !dns_used)
    + Curl_resolv_unlock(data, conn->async.dns);

    if(code)
    /* We're not allowed to return failure with memory left allocated

     
  • Thanks for the report and patch.

    Can you please submit it as an attachment to this report, as it gets manged by this comment system when inlined and thus is somewhat hard to extract and apply...

     
  • Phil Lisiecki
    Phil Lisiecki
    2008-12-11

    Proposed patch

     
    Attachments
  • Phil Lisiecki
    Phil Lisiecki
    2008-12-11

    Posting patch as an attachment as requested
    File Added: patch

     
  • my edited (pure C) version that shows no leak to me

     
    Attachments
  • I just uploaded my version of your test code "2413067.c" which is a C version that exits and it does not show any leaks to me. Not with libcurls own leak tracker nor with valgrind. Using the current libcurl in CVS built on linux with c-ares.

    How do you "prove" the leak? What resolver do you use? What operating system do you use? Do you see a leak with my version of the test too?

     
  • Phil Lisiecki
    Phil Lisiecki
    2008-12-13

    I just uploaded my version of your test code "2413067.c" which is a C
    version that exits and it does not show any leaks to me. Not with libcurls
    own leak tracker nor with valgrind. Using the current libcurl in CVS built
    on linux with c-ares.

    How do you "prove" the leak? What resolver do you use? What operating
    system do you use? Do you see a leak with my version of the test too?

    First the simple stuff. I am using a bind9 under a linux-2.4-based
    kernel, gcc-3.3. "dig" confirms that the DNS entries are updating
    successfully on my client machine. I do see the TTL problem with your
    example, but the "leak" is hard to see with just one hostname...

    I should have been more specific with my definition of "leak." All of
    the allocated memory is associated with the DNS cache, which in turn
    is associated with the easy handle. So, curl_easy_cleanup() will find
    all of the memory that was allocated and free it appropriately. But,
    a long-running process which uses only a single easy handle will use
    an amount of memory proportional to the number of host names it
    resolves to broken IPs. As I understand the intended memory usage,
    only cache entries used in the last DNS_CACHE_TIMEOUT should be kept
    around.

    The original problem which I observed in my application that led me to
    investigate the problem was actually the broken TTL issue. My
    application repeatedly makes requests against some particular
    hostname. This host died, so I updated the DNS record to point to a
    new IP, but my application continued connecting to the original host
    forever (until I restarted the daemon by hand).

    Reproducing the TTL problem:

    I augmented your C example to print the PRIMARY_IP after each trial
    and to sleep 20 seconds between trials. (2413067_ttl.c)

    Then, I set up a domain name that points to a nonexistent host with a
    30-second TTL and started the test program and saw that it failed to
    connect to the bad IP. Then, I change the domain name and confirmed
    that the machine's local named (/etc/resolv.conf points to 127.0.0.1)
    had picked up the update within 30 seconds. But, the example program
    continued to print the wrong IP even after another 60 seconds had
    elapsed. "netstat" also shows "SYN_SENT" to the old IP as well, so
    this isn't just a bug with the PRIMARY_IP interface.

    Also, if you attach gdb or add debug print, you will see that the
    "inuse" counter on the host entry continues to increase.

    Reproducing the "leak":

    To observe the actual "leak," I have made another slightly modified
    version of your test program (2413067_leak.c). This program loops
    over ten domain names (by inserting the numbers 1-10 between argv[1]
    and argv[2]), with a 20-second sleep between hosts. If all 10
    hostnames resolve to nonexistent hosts, you would expect the cache
    size to grow to about 3-4 entries (I am using a post on the same
    subnet such that connect times out after 3 seconds; off subnet, the
    timeout will be much longer, such that you might expect 2 or fewer
    entries). To show the number of entries, I added a printf to
    Curl_hostcache_prune (patch 2413067_leak_debug_print_patch). Instead,
    the cache grows all the way to 10 entries. You could extend this to
    an arbitrary number of entries if you want...

    Here are the post-prune cache sizes I saw:

    post-prune cache size: 1
    post-prune cache size: 2
    post-prune cache size: 3
    post-prune cache size: 4
    post-prune cache size: 5
    post-prune cache size: 6
    post-prune cache size: 7
    post-prune cache size: 8
    post-prune cache size: 9
    post-prune cache size: 10

    Note that the expected cache behavior happens with a successful
    connection. I used this "web server" to answer curl's requests:

    while true; do (echo 'HTTP/1.0 200 OK'; echo 'Connection: close'; echo 'Content-Length: 0'; echo) | nc -l -p 80; echo hi; done

    post-prune cache size: 1
    post-prune cache size: 2
    post-prune cache size: 3
    post-prune cache size: 3
    post-prune cache size: 3
    post-prune cache size: 3
    post-prune cache size: 3
    post-prune cache size: 3
    post-prune cache size: 3
    post-prune cache size: 3

    File Added: 2413067_ttl.c

     
  • Phil Lisiecki
    Phil Lisiecki
    2008-12-13

    TTL problem example

     
    Attachments
  • Phil Lisiecki
    Phil Lisiecki
    2008-12-13

    Another attachment...
    File Added: 2413067_leak.c

     
1 2 3 > >> (Page 1 of 3)