[ekhtml-devel] Status
Brought to you by:
jick
|
From: Todd F. <ta...@le...> - 2003-04-30 06:46:49
|
Hi,
I recently downloaded the release version 0.32 of ekhtml. In using
it I'm having a little difficulty...
The example application runs just fine, and if I use the library to do a
single parse, with no streaming, it works...
However, when I try to use it for streaming I get a core dump, and it
would appear that the library is
passing a bad string pointer to one of my event handlers. Anyway,
here's how I'm using the library
and here's the stack trace.
I also have a question about initializing callbacks: is the library case
insensitive when I set tag handlers
like in the following init method? Obviously, I can figure this out with
a few quick tests, but since
I'm asking I figured it should be easy enough to answer...
Thanks in advance,
-todd
// Register this class's handlers on ekparser: the data callback, start and
// end handlers for the title tag, and start handlers for the anchor and meta
// tags. Each tag name is registered in both lower and upper case.
// NOTE(review): confirm whether the library matches tag names
// case-insensitively; if it does, one spelling per tag would be enough.
void PageProc::init()
{
    const char *const title_tags[] = { "title", "TITLE" };
    const char *const anchor_tags[] = { "a", "A" };
    const char *const meta_tags[] = { "meta", "META" };
    const int spellings = 2; // entries in each of the arrays above

    ekhtml_parser_datacb_set( ekparser, handle_clear_text );

    // title needs both a start and an end handler
    for( int i = 0; i < spellings; ++i ){
        ekhtml_parser_startcb_add( ekparser, title_tags[i], handle_title_start );
        ekhtml_parser_endcb_add( ekparser, title_tags[i], handle_title_end );
    }

    // anchors and meta tags only need a start handler
    for( int i = 0; i < spellings; ++i ){
        ekhtml_parser_startcb_add( ekparser, anchor_tags[i], handle_a_tag_start );
    }
    for( int i = 0; i < spellings; ++i ){
        ekhtml_parser_startcb_add( ekparser, meta_tags[i], handle_meta_tag_start );
    }
}
// Prepare for parsing a new document: allocate and reset the per-document
// parse state, create a fresh ekhtml parser, register the handlers through
// init(), and set the page as the parser's callback data.
//   p      - page stored in the parse state and handed to the callbacks
//   hindex - host index stored in the parse state
void PageProc::
parse_init( Page *p, HostIndex *hindex )
{
    // Fill in the new state through a local pointer, then publish it.
    ParseData *state = new ParseData;
    state->page = p;
    state->host_index = hindex;
    state->check_is_html = true; // the first parse_feed() call runs the check
    state->is_html = true;
    state->noindex = false;
    state->nofollow = false;
    state->add_to_title = false;
    parse_data = state;

    // Create the parser without callback data, hook up the handlers, and
    // only then attach the page as callback data.
    ekparser = ekhtml_parser_new( NULL );
    init();
    ekhtml_parser_cbdata_set( ekparser, p );
}
// Feed one chunk of a document to the parser and append it to the page's
// content buffer.
//
// The first non-empty chunk is passed to html_check() to decide, once,
// whether the document is HTML; page->type is set from that result and left
// alone on later chunks. Chunks are only parsed when the document was judged
// to be HTML, but they are always appended to page->content.
//   new_bytes - start of the chunk (need not be NUL-terminated; bytes gives
//               the length)
//   bytes     - number of bytes in the chunk
void PageProc::
parse_feed( const char *new_bytes, size_t bytes )
{
    // An empty chunk has nothing to classify, parse or store. Returning
    // here also keeps the one-time HTML check for the first real data.
    if( !new_bytes || bytes == 0 ){
        return;
    }

    if( parse_data->check_is_html ){ // first chunk: check if the doc is html
        parse_data->check_is_html = false; // only check once
        parse_data->is_html = html_check( new_bytes, bytes );
        // Label the page from the outcome of the check. The label is set
        // only here, so later chunks cannot overwrite it.
        // NOTE(review): "UNKOWN" (sic) is kept as-is in case other code
        // compares against this exact spelling.
        if( parse_data->is_html ){
            parse_data->page->type = "HTML";
        }
        else{
            parse_data->page->type = "UNKOWN";
        }
    }

    if( parse_data->is_html ){
        ekhtml_string_t str;
        str.str = new_bytes;
        str.len = bytes;
        fprintf( stderr, "parsing document\n" );
        ekhtml_parser_feed( this->ekparser, &str );
        fprintf( stderr, "flushing\n" );
        // flushall = 0: not a final flush; parse_close() does the final one
        ekhtml_parser_flush( this->ekparser, 0 );
    }

    fprintf( stderr, "Appending to content buffer\n" );
    // if its html or not always store the new
    // document because we can later do a more
    // extensive analysis of the doc type
    parse_data->page->content.append( new_bytes, bytes );
    fprintf( stderr, "done\n" );
}
// Finish parsing the current document: do the final parser flush, destroy
// the parser, stamp the page's modification date, and free the parse state.
// Both ekparser and parse_data are reset to NULL afterwards so that stale
// pointers to freed objects are not left behind.
void PageProc::
parse_close()
{
    // flushall = 1: final flush before the parser goes away
    ekhtml_parser_flush( ekparser, 1 );
    ekhtml_parser_destroy( ekparser );
    ekparser = NULL; // the handle is no longer valid

    // set the mod date for the page to the current local time, in the
    // text form produced by asctime()
    time_t rawtime;
    time( &rawtime );
    parse_data->page->mod_date = asctime( localtime( &rawtime ) );

    // free our parse data; only the ParseData object is deleted here, the
    // page it points to is not
    delete parse_data;
    parse_data = NULL;
}
#0 0x4207c45c in memcpy () from /lib/tls/libc.so.6
#1 0x4029b40e in std::string::_Rep::_M_clone(std::allocator<char>
const&, unsigned) () from /usr/lib/libstdc++.so.5
#2 0x40299146 in std::string::reserve(unsigned) () from
/usr/lib/libstdc++.so.5
#3 0x40299642 in std::string::append(char const*, unsigned) () from
/usr/lib/libstdc++.so.5
#4 0x0804ed1a in handle_clear_text (cbdata=0x806c000, str=0x402c939c)
at PageProc.cc:195
#5 0x40218fc2 in ekhtml_parse_special (parser=0x4214abdc,
state_data=0x806716c,
curp=0x200fa64 <Address 0x200fa64 out of bounds>, endp=0x42134014
"", baddata=0xbfffdc64) at ekhtml_special.c:64
#6 0x402185ec in ekhtml_parser_flush (parser=0x8067120, flushall=0) at
ekhtml.c:173
#7 0x0804e4fb in PageProc::parse_feed(char const*, unsigned)
(this=0xbfffdfc0,
new_bytes=0x805dc91 "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01
Transitional//EN\">\n<html lang=\"en\">\n<head>\n\t<meta
http-equiv=\"content-type\" content=\"text/html;
charset=iso-8859-1\">\n\t<meta http-equiv=\"refresh\" content=\"1"...,
bytes=517) at PageProc.cc:54
#8 0x0804ee16 in retrieve_doc(void*, unsigned, unsigned, PageProc*)
(ptr=0x805dc91, size=1, nmemb=517, parser=0xbfffdfc0)
at PageProc.cc:213
#9 0x400227c4 in Curl_client_write (data=0x805d610, type=1,
ptr=0x805dc91 "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01
Transitional//EN\">\n<html lang=\"en\">\n<head>\n\t<meta
http-equiv=\"content-type\" content=\"text/html;
charset=iso-8859-1\">\n\t<meta http-equiv=\"refresh\" content=\"1"...,
len=517) at sendf.c:309
#10 0x400322bf in Curl_httpchunk_read (conn=0x8068fe0,
datap=0x805dc91 "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01
Transitional//EN\">\n<html lang=\"en\">\n<head>\n\t<meta
http-equiv=\"content-type\" content=\"text/html;
charset=iso-8859-1\">\n\t<meta http-equiv=\"refresh\" content=\"1"...,
length=1407, wrote=0xbfffddb4) at http_chunks.c:183
#11 0x4003052c in Curl_readwrite (conn=0x8068fe0, done=0xbfffdf0f "") at
transfer.c:841
#12 0x4003155d in Transfer (conn=0x8068fe0) at transfer.c:1318
#13 0x40031b3a in Curl_perform (data=0x805d610) at transfer.c:1657
#14 0x40031ebc in curl_easy_perform (curl=0x805d610) at easy.c:247
#15 0x0804a5df in main () at page_proc_test.cc:22
#16 0x420156a4 in __libc_start_main () from /lib/tls/libc.so.6
|