From: Jamie A. <jam...@sl...> - 2001-10-18 01:54:09
|
Here's another quickie patch to 3.2.x. It adds support for a content-type alias attribute for htdig, which works around servers that get the content-type wrong in their responses. While getting the server correctly configured is probably a better solution, this works if the server is maintained by someone else. Usage is something like: content_type_aliases: text/plain=text/html and it can be set for specific servers in the server block. Also included is the patch as an attachment, in case the copy-and-paste gets mangled. Is the config file entry in the right format? I've not seen a definitive description of what should be where, so it's a bit of a guess. ======================================= diff -rup htdig/htcommon/defaults.cc htdig-patch3/htcommon/defaults.cc --- htdig/htcommon/defaults.cc Thu Aug 30 03:43:38 2001 +++ htdig-patch3/htcommon/defaults.cc Thu Oct 18 14:49:27 2001 @@ -271,6 +271,13 @@ http://www.htdig.org/", " \ compile time. \ </p> \ " }, \ +{ "content_type_aliases", "", \ + "string list", "htdig", "server", "SLI-special", "Indexing:Where", \ + "content_type_aliases: text/plain=text/html", " \ + This attribute tells htdig to use a different parser to that indicated by the content-type \ + returned by the server. This is occasionally useful for mis-configured servers who server \ + up dynamic content but don't set the content-type correctly. \ +" }, \ { "create_image_list", "false", \ "boolean", "htdig", "", "all", "Extra Output", "create_image_list: yes", " \ If set to true, a file with all the image URLs that \ @@ -2545,6 +2552,8 @@ form during indexing and translated for "string", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor_output: myfile", " \ Print monitoring output on file instead of the default stderr. 
\ " }, + + {0, 0, 0, 0, 0, 0, 0, 0, 0} }; Only in htdig-patch3/htcommon: defaults.cc~ diff -rup htdig/htdig/Document.cc htdig-patch3/htdig/Document.cc --- htdig/htdig/Document.cc Thu May 17 04:36:44 2001 +++ htdig-patch3/htdig/Document.cc Thu Oct 18 14:27:37 2001 @@ -629,7 +629,7 @@ Document::RetrieveLocal(HtDateTime date, // parsers are external programs that will be used. // Parsable * -Document::getParsable() +Document::getParsable( const String& serverName ) { static HTML *html = 0; static Plaintext *plaintext = 0; @@ -637,6 +637,8 @@ Document::getParsable() Parsable *parsable = 0; + ContentTypeAlias( serverName ); + if (ExternalParser::canParse(contentType)) { if (externalParser) @@ -701,4 +703,51 @@ int Document::ShouldWeRetry(Transport::D return 1; return 0; +} + + + +void +Document::ContentTypeAlias( const String& serverName ) +{ + HtConfiguration* config= HtConfiguration::config(); + Dictionary content_type_aliases; + + String l; + if ( serverName.length() > 0 ) + l = config->Find("server", serverName, "content_type_aliases"); + else + l = config->Find( "content_type_aliases"); + + if ( l.length() == 0 ) + return; + + String from, *to; + char *p = strtok(l, " \t"); + char *ct_alias= NULL; + while (p) + { + ct_alias = strchr(p, '='); + if (! 
ct_alias ) + { + p = strtok(0, " \t"); + continue; + } + *ct_alias++= '\0'; + from = p; + to= new String( ct_alias ); + content_type_aliases.Add(from.get(), to); + // fprintf (stderr, "Alias: %s->%s\n", from.get(), to->get()); + p = strtok(0, " \t"); + } + + + String* new_ct = 0; + if ( (new_ct = (String*) content_type_aliases.Find( contentType )) ) + { + if ( debug > 1 ) + cout << "Translating content type '" << contentType << "' to '" << *new_ct << "'\n"; + contentType = *new_ct; + } + } diff -rup htdig/htdig/Retriever.cc htdig-patch3/htdig/Retriever.cc --- htdig/htdig/Retriever.cc Tue Oct 16 15:27:16 2001 +++ htdig-patch3/htdig/Retriever.cc Thu Oct 18 14:28:29 2001 @@ -802,7 +802,7 @@ Retriever::RetrievedDocument(Document &d // routines. // This will generate the Parsable object as a specific parser // - Parsable *parsable = doc.getParsable(); + Parsable *parsable = doc.getParsable( base->host() ); if (parsable) parsable->parse(*this, *base); else ======================================= Jamie Anstice Search Engineer S.L.I. Systems jam...@sl... ph: 64 961 3262 mobile: 64 21 264 9347 |