From: Lachlan A. <lh...@ee...> - 2002-06-24 01:54:32
|
On Fri, Jun 14, 2002 at 06:03:09PM -0500, Gilles Detillieux wrote: > I'd recommend two changes: > 1) Grab the most recent 3.2.0b4 snapshot > 2) The HtFile::Request() and Document::RetrieveLocal() methods both > have some hardcoded extensions, which should probably be kept in the > new HtFile::Ext2Mime() method. HtFile::Request() currently falls back > on these when it can't open mime.types. Greetings, Below is the patch against 3.2.0b4-20020616. This includes the hardcoded types, and bad_local_extensions to allow .php etc. not to be parsed locally. If bad_local_extensions is explicitly set empty, do you think it would be good to allow *all* files to be parsed locally (even those with no extensions)? Of course, ones for which no MIME type is known would have to be treated as text/plain but it would be good if a site has a lot of text files with no extensions. Also, would there be any demand to index compressed files? If someone has a lot of .ps.gz files, for example, it could be useful to include them in the index. Finally, I think someone has been editing the files with a tab size other than 8... Is there a policy on that? Cheers, Lachlan *** htdig/Document.cc Sun Jan 13 19:13:13 2002 --- htdig/Document.cc.lha Mon Jun 24 01:06:48 2002 *************** *** 72,78 **** FileConnect = 0; NNTPConnect = 0; externalConnect = 0; ! HtConfiguration* config= HtConfiguration::config(); // We probably need to move assignment of max_doc_size, according // to a server or url configuration value. The same is valid for --- 72,78 ---- FileConnect = 0; NNTPConnect = 0; externalConnect = 0; ! HtConfiguration* config= HtConfiguration::config(); // We probably need to move assignment of max_doc_size, according // to a server or url configuration value. The same is valid for *************** *** 549,555 **** Transport::DocStatus Document::RetrieveLocal(HtDateTime date, StringList *filenames) { ! 
HtConfiguration* config= HtConfiguration::config(); struct stat stat_buf; String *filename; --- 549,555 ---- Transport::DocStatus Document::RetrieveLocal(HtDateTime date, StringList *filenames) { ! HtConfiguration* config= HtConfiguration::config(); struct stat stat_buf; String *filename; *************** *** 558,564 **** // Loop through list of potential filenames until the list is exhausted // or a suitable file is found to exist as a regular file. while ((filename = (String *)filenames->Get_Next()) && ! ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode))) if (debug > 1) cout << " tried local file " << *filename << endl; --- 558,564 ---- // Loop through list of potential filenames until the list is exhausted // or a suitable file is found to exist as a regular file. while ((filename = (String *)filenames->Get_Next()) && ! ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode))) if (debug > 1) cout << " tried local file " << *filename << endl; *************** *** 572,593 **** if (modtime <= date) return Transport::Document_not_changed; - // Process only HTML files (this could be changed if we read - // the server's mime.types file). - // (...and handle a select few other types for now... this should - // eventually be handled by the "file://..." handler, which uses - // mime.types to determine the file type.) -- FIXME!! char *ext = strrchr((char*)*filename, '.'); if (ext == NULL) return Transport::Document_not_local; ! if ((mystrcasecmp(ext, ".html") == 0) || (mystrcasecmp(ext, ".htm") == 0)) ! contentType = "text/html"; ! else if ((mystrcasecmp(ext, ".txt") == 0) || (mystrcasecmp(ext, ".asc") == 0)) ! contentType = "text/plain"; ! else if ((mystrcasecmp(ext, ".pdf") == 0)) ! contentType = "application/pdf"; ! else if ((mystrcasecmp(ext, ".ps") == 0) || (mystrcasecmp(ext, ".eps") == 0)) ! 
contentType = "application/postscript"; else return Transport::Document_not_local; --- 572,585 ---- if (modtime <= date) return Transport::Document_not_changed; char *ext = strrchr((char*)*filename, '.'); + if (ext && strchr(ext,'/')) // Ignore a dot if it's not in the + ext = NULL; // final component of the path. if (ext == NULL) return Transport::Document_not_local; ! const String *type = HtFile::Ext2Mime (ext + 1); ! if (type != NULL) ! contentType = *type; else return Transport::Document_not_local; *** htnet/HtFile.h Mon Jun 24 01:02:42 2002 --- htnet/HtFile.h.lha Mon Jun 24 01:02:51 2002 *************** *** 64,69 **** --- 64,73 ---- // manages a Transport request (method inherited from Transport class) virtual DocStatus Request (); + // Determine Mime type of file + // (Does it belong here??) + static const String *Ext2Mime (const char *); + /////// // Interface for resource retrieving /////// *** htnet/HtFile.cc Sun Dec 23 19:13:14 2001 --- htnet/HtFile.cc.lha Mon Jun 24 00:48:34 2002 *************** *** 76,96 **** } ! /////// ! // Manages the requesting process ! /////// ! ! HtFile::DocStatus HtFile::Request() { - HtConfiguration* config= HtConfiguration::config(); static Dictionary *mime_map = 0; if (!mime_map) { mime_map = new Dictionary(); ifstream in(config->Find("mime_types").get()); if (in) { String line; while (in >> line) { --- 76,110 ---- } ! // Return mime type indicated by extension ext (which is assumed not ! // to contain the '.'), or NULL if ext is not a know mime type, or ! // is listed in bad_local_extensions. ! 
const String *HtFile::Ext2Mime (const char *ext) { static Dictionary *mime_map = 0; if (!mime_map) { + HtConfiguration* config= HtConfiguration::config(); mime_map = new Dictionary(); + if (!mime_map) + return NULL; + + if (debug > 2) + cout << "MIME types: " << config->Find("mime_types").get() << endl; ifstream in(config->Find("mime_types").get()); if (in) { + // Set up temporary dictionary of extensions not to parse locally + Dictionary bad_local_exts; + StringList split_exts(config->Find("bad_local_extensions"), "\t ."); + for (int i = 0; i < split_exts.Count(); i++) + { + if (debug > 3) + cout << "Bad local extension: " << split_exts[i] << endl; + bad_local_exts.Add(split_exts[i], 0); + } + String line; while (in >> line) { *************** *** 99,114 **** if ((cmt = line.indexOf('#')) >= 0) line = line.sub(0, cmt); StringList split_line(line, "\t "); ! // Let's cache mime type to lesser the number of ! // operator [] callings String mime_type = split_line[0]; // Fill map with values. for (int i = 1; i < split_line.Count(); i++) ! mime_map->Add(split_line[i], new String(mime_type)); } } } // Reset the response _response.Reset(); --- 113,161 ---- if ((cmt = line.indexOf('#')) >= 0) line = line.sub(0, cmt); StringList split_line(line, "\t "); ! // cache mime type to lessen the number of operator [] callings String mime_type = split_line[0]; // Fill map with values. for (int i = 1; i < split_line.Count(); i++) ! { ! const char *ext = split_line [i]; ! if (bad_local_exts.Exists(ext)) ! { ! if (debug > 3) ! cout << "Bad local extension: " << ext << endl; ! continue; ! } ! ! if (debug > 3) ! cout << "MIME: " << ext << "\t-> " << mime_type << endl; ! mime_map->Add(ext, new String(mime_type)); ! } } } + else + { + if (debug > 2) + cout << "MIME types file not found. 
Using default types.\n"; + mime_map->Add(String("html"), new String("text/html")); + mime_map->Add(String("htm"), new String("text/html")); + mime_map->Add(String("txt"), new String("text/plain")); + mime_map->Add(String("asc"), new String("text/plain")); + mime_map->Add(String("pdf"), new String("application/pdf")); + mime_map->Add(String("ps"), new String("application/postscript")); + mime_map->Add(String("eps"), new String("application/postscript")); + } } + // return MIME type, or NULL if not found + return (String *)mime_map->Find(ext); + } + + /////// + // Manages the requesting process + /////// + + HtFile::DocStatus HtFile::Request() + { // Reset the response _response.Reset(); *************** *** 166,191 **** return Transport::Document_not_changed; char *ext = strrchr(_url.path(), '.'); if (ext == NULL) return Transport::Document_not_local; ! if (mime_map && mime_map->Count()) ! { ! String *mime_type = (String *)mime_map->Find(ext + 1); ! if (mime_type) ! _response._content_type = *mime_type; ! else ! return Transport::Document_not_local; ! } else ! { ! if ((mystrcasecmp(ext, ".html") == 0) || (mystrcasecmp(ext, ".htm") == 0)) ! _response._content_type = "text/html"; ! else if (mystrcasecmp(ext, ".txt") == 0) ! _response._content_type = "text/plain"; ! else ! return Transport::Document_not_local; ! } _response._modification_time = new HtDateTime(stat_buf.st_mtime); --- 213,228 ---- return Transport::Document_not_changed; char *ext = strrchr(_url.path(), '.'); + if (ext && strchr(ext,'/')) // Ignore a dot if it's not in the + ext = NULL; // final component of the path. if (ext == NULL) return Transport::Document_not_local; ! const String *mime_type = Ext2Mime(ext + 1); ! if (mime_type) ! _response._content_type = *mime_type; else ! 
return Transport::Document_not_local; _response._modification_time = new HtDateTime(stat_buf.st_mtime); *** htcommon/defaults.cc Sun Jun 23 23:55:41 2002 --- htcommon/defaults.cc.lha Mon Jun 24 01:01:09 2002 *************** *** 145,151 **** documents as text while they are some binary format. \ If the list is empty, then all extensions are acceptable, \ provided they pass other criteria for acceptance or rejection. \ ! See also <a href=\"#valid_extensions\">valid_extensions</a>. \ " }, \ { "bad_querystr", "", \ "pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: forum=private section=topsecret&passwd=required", " \ --- 145,165 ---- documents as text while they are some binary format. \ If the list is empty, then all extensions are acceptable, \ provided they pass other criteria for acceptance or rejection. \ ! See also <a href=\"#valid_extensions\">valid_extensions</a> and \ ! <a href=\"#bad_local_extensions\">bad_local_extensions</a>. \ ! " }, \ ! { "bad_local_extensions", ".php .shtml", \ ! "string list", "htdig", "URL", "all", "Indexing:Where", "bad_local_extensions: .php .foo .bar", " \ ! This is a list of extensions on URLs which are \ ! considered active, that is, the content delivered by the web \ ! server is not simply the text of the file, but is generated \ ! on-the-fly. This list is used mainly to allow URLs on the local \ ! machine to be read using the local filesystem, rather than \ ! through HTTP. \ ! If the list is empty, then all extensions are acceptable, \ ! provided they pass other criteria for acceptance or rejection. \ ! See also <a href=\"#valid_extensions\">valid_extensions</a> and \ ! <a href=\"#bad_extensions\">bad_extensions</a>. \ " }, \ { "bad_querystr", "", \ "pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: forum=private section=topsecret&passwd=required", " \ -- Lachlan Andrew lh...@ee... 
Phone: +613 8344-3816 Fax: +613 8344-6678 Department of Electrical and Electronic Engineering CRICOS Provider Code University of Melbourne, Victoria, 3010 AUSTRALIA 00116K |