|
From: Lachlan A. <lh...@ee...> - 2002-06-24 01:54:32
|
On Fri, Jun 14, 2002 at 06:03:09PM -0500, Gilles Detillieux wrote:
> I'd recommend two changes:
> 1) Grab the most recent 3.2.0b4 snapshot
> 2) The HtFile::Request() and Document::RetrieveLocal() methods both
> have some hardcoded extensions, which should probably be kept in the
> new HtFile::Ext2Mime() method. HtFile::Request() currently falls back
> on these when it can't open mime.types.
Greetings,
Below is the patch against 3.2.0b4-20020616. It includes the
hardcoded types, plus a new bad_local_extensions option so that .php etc.
are not parsed locally. If bad_local_extensions is explicitly set
empty, do you think it would be good to allow *all* files to be parsed
locally (even those with no extensions)? Of course, ones for which no
MIME type is known would have to be treated as text/plain, but that
would be useful if a site has a lot of text files with no extensions.
Also, would there be any demand to index compressed files? If someone
has a lot of .ps.gz files, for example, it could be useful to include
them in the index.
Finally, I think someone has been editing the files with a tab size other
than 8... Is there a policy on that?
Cheers,
Lachlan
*** htdig/Document.cc Sun Jan 13 19:13:13 2002
--- htdig/Document.cc.lha Mon Jun 24 01:06:48 2002
***************
*** 72,78 ****
FileConnect = 0;
NNTPConnect = 0;
externalConnect = 0;
! HtConfiguration* config= HtConfiguration::config();
// We probably need to move assignment of max_doc_size, according
// to a server or url configuration value. The same is valid for
--- 72,78 ----
FileConnect = 0;
NNTPConnect = 0;
externalConnect = 0;
! HtConfiguration* config= HtConfiguration::config();
// We probably need to move assignment of max_doc_size, according
// to a server or url configuration value. The same is valid for
***************
*** 549,555 ****
Transport::DocStatus
Document::RetrieveLocal(HtDateTime date, StringList *filenames)
{
! HtConfiguration* config= HtConfiguration::config();
struct stat stat_buf;
String *filename;
--- 549,555 ----
Transport::DocStatus
Document::RetrieveLocal(HtDateTime date, StringList *filenames)
{
! HtConfiguration* config= HtConfiguration::config();
struct stat stat_buf;
String *filename;
***************
*** 558,564 ****
// Loop through list of potential filenames until the list is exhausted
// or a suitable file is found to exist as a regular file.
while ((filename = (String *)filenames->Get_Next()) &&
! ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode)))
if (debug > 1)
cout << " tried local file " << *filename << endl;
--- 558,564 ----
// Loop through list of potential filenames until the list is exhausted
// or a suitable file is found to exist as a regular file.
while ((filename = (String *)filenames->Get_Next()) &&
! ((stat((char*)*filename, &stat_buf) == -1) || !S_ISREG(stat_buf.st_mode)))
if (debug > 1)
cout << " tried local file " << *filename << endl;
***************
*** 572,593 ****
if (modtime <= date)
return Transport::Document_not_changed;
- // Process only HTML files (this could be changed if we read
- // the server's mime.types file).
- // (...and handle a select few other types for now... this should
- // eventually be handled by the "file://..." handler, which uses
- // mime.types to determine the file type.) -- FIXME!!
char *ext = strrchr((char*)*filename, '.');
if (ext == NULL)
return Transport::Document_not_local;
! if ((mystrcasecmp(ext, ".html") == 0) || (mystrcasecmp(ext, ".htm") == 0))
! contentType = "text/html";
! else if ((mystrcasecmp(ext, ".txt") == 0) || (mystrcasecmp(ext, ".asc") == 0))
! contentType = "text/plain";
! else if ((mystrcasecmp(ext, ".pdf") == 0))
! contentType = "application/pdf";
! else if ((mystrcasecmp(ext, ".ps") == 0) || (mystrcasecmp(ext, ".eps") == 0))
! contentType = "application/postscript";
else
return Transport::Document_not_local;
--- 572,585 ----
if (modtime <= date)
return Transport::Document_not_changed;
char *ext = strrchr((char*)*filename, '.');
+ if (ext && strchr(ext,'/')) // Ignore a dot if it's not in the
+ ext = NULL; // final component of the path.
if (ext == NULL)
return Transport::Document_not_local;
! const String *type = HtFile::Ext2Mime (ext + 1);
! if (type != NULL)
! contentType = *type;
else
return Transport::Document_not_local;
*** htnet/HtFile.h Mon Jun 24 01:02:42 2002
--- htnet/HtFile.h.lha Mon Jun 24 01:02:51 2002
***************
*** 64,69 ****
--- 64,73 ----
// manages a Transport request (method inherited from Transport class)
virtual DocStatus Request ();
+ // Determine Mime type of file
+ // (Does it belong here??)
+ static const String *Ext2Mime (const char *);
+
///////
// Interface for resource retrieving
///////
*** htnet/HtFile.cc Sun Dec 23 19:13:14 2001
--- htnet/HtFile.cc.lha Mon Jun 24 00:48:34 2002
***************
*** 76,96 ****
}
! ///////
! // Manages the requesting process
! ///////
!
! HtFile::DocStatus HtFile::Request()
{
- HtConfiguration* config= HtConfiguration::config();
static Dictionary *mime_map = 0;
if (!mime_map)
{
mime_map = new Dictionary();
ifstream in(config->Find("mime_types").get());
if (in)
{
String line;
while (in >> line)
{
--- 76,110 ----
}
! // Return mime type indicated by extension ext (which is assumed not
! // to contain the '.'), or NULL if ext is not a know mime type, or
! // is listed in bad_local_extensions.
! const String *HtFile::Ext2Mime (const char *ext)
{
static Dictionary *mime_map = 0;
if (!mime_map)
{
+ HtConfiguration* config= HtConfiguration::config();
mime_map = new Dictionary();
+ if (!mime_map)
+ return NULL;
+
+ if (debug > 2)
+ cout << "MIME types: " << config->Find("mime_types").get() << endl;
ifstream in(config->Find("mime_types").get());
if (in)
{
+ // Set up temporary dictionary of extensions not to parse locally
+ Dictionary bad_local_exts;
+ StringList split_exts(config->Find("bad_local_extensions"), "\t .");
+ for (int i = 0; i < split_exts.Count(); i++)
+ {
+ if (debug > 3)
+ cout << "Bad local extension: " << split_exts[i] << endl;
+ bad_local_exts.Add(split_exts[i], 0);
+ }
+
String line;
while (in >> line)
{
***************
*** 99,114 ****
if ((cmt = line.indexOf('#')) >= 0)
line = line.sub(0, cmt);
StringList split_line(line, "\t ");
! // Let's cache mime type to lesser the number of
! // operator [] callings
String mime_type = split_line[0];
// Fill map with values.
for (int i = 1; i < split_line.Count(); i++)
! mime_map->Add(split_line[i], new String(mime_type));
}
}
}
// Reset the response
_response.Reset();
--- 113,161 ----
if ((cmt = line.indexOf('#')) >= 0)
line = line.sub(0, cmt);
StringList split_line(line, "\t ");
! // cache mime type to lessen the number of operator [] callings
String mime_type = split_line[0];
// Fill map with values.
for (int i = 1; i < split_line.Count(); i++)
! {
! const char *ext = split_line [i];
! if (bad_local_exts.Exists(ext))
! {
! if (debug > 3)
! cout << "Bad local extension: " << ext << endl;
! continue;
! }
!
! if (debug > 3)
! cout << "MIME: " << ext << "\t-> " << mime_type << endl;
! mime_map->Add(ext, new String(mime_type));
! }
}
}
+ else
+ {
+ if (debug > 2)
+ cout << "MIME types file not found. Using default types.\n";
+ mime_map->Add(String("html"), new String("text/html"));
+ mime_map->Add(String("htm"), new String("text/html"));
+ mime_map->Add(String("txt"), new String("text/plain"));
+ mime_map->Add(String("asc"), new String("text/plain"));
+ mime_map->Add(String("pdf"), new String("application/pdf"));
+ mime_map->Add(String("ps"), new String("application/postscript"));
+ mime_map->Add(String("eps"), new String("application/postscript"));
+ }
}
+ // return MIME type, or NULL if not found
+ return (String *)mime_map->Find(ext);
+ }
+
+ ///////
+ // Manages the requesting process
+ ///////
+
+ HtFile::DocStatus HtFile::Request()
+ {
// Reset the response
_response.Reset();
***************
*** 166,191 ****
return Transport::Document_not_changed;
char *ext = strrchr(_url.path(), '.');
if (ext == NULL)
return Transport::Document_not_local;
! if (mime_map && mime_map->Count())
! {
! String *mime_type = (String *)mime_map->Find(ext + 1);
! if (mime_type)
! _response._content_type = *mime_type;
! else
! return Transport::Document_not_local;
! }
else
! {
! if ((mystrcasecmp(ext, ".html") == 0) || (mystrcasecmp(ext, ".htm") == 0))
! _response._content_type = "text/html";
! else if (mystrcasecmp(ext, ".txt") == 0)
! _response._content_type = "text/plain";
! else
! return Transport::Document_not_local;
! }
_response._modification_time = new HtDateTime(stat_buf.st_mtime);
--- 213,228 ----
return Transport::Document_not_changed;
char *ext = strrchr(_url.path(), '.');
+ if (ext && strchr(ext,'/')) // Ignore a dot if it's not in the
+ ext = NULL; // final component of the path.
if (ext == NULL)
return Transport::Document_not_local;
! const String *mime_type = Ext2Mime(ext + 1);
! if (mime_type)
! _response._content_type = *mime_type;
else
! return Transport::Document_not_local;
_response._modification_time = new HtDateTime(stat_buf.st_mtime);
*** htcommon/defaults.cc Sun Jun 23 23:55:41 2002
--- htcommon/defaults.cc.lha Mon Jun 24 01:01:09 2002
***************
*** 145,151 ****
documents as text while they are some binary format. \
If the list is empty, then all extensions are acceptable, \
provided they pass other criteria for acceptance or rejection. \
! See also <a href=\"#valid_extensions\">valid_extensions</a>. \
" }, \
{ "bad_querystr", "", \
"pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: forum=private section=topsecret&passwd=required", " \
--- 145,165 ----
documents as text while they are some binary format. \
If the list is empty, then all extensions are acceptable, \
provided they pass other criteria for acceptance or rejection. \
! See also <a href=\"#valid_extensions\">valid_extensions</a> and \
! <a href=\"#bad_local_extensions\">bad_local_extensions</a>. \
! " }, \
! { "bad_local_extensions", ".php .shtml", \
! "string list", "htdig", "URL", "all", "Indexing:Where", "bad_local_extensions: .php .foo .bar", " \
! This is a list of extensions on URLs which are \
! considered active, that is, the content delivered by the web \
! server is not simply the text of the file, but is generated \
! on-the-fly. This list is used mainly to allow URLs on the local \
! machine to be read using the local filesystem, rather than \
! through HTTP. \
! If the list is empty, then all extensions are acceptable, \
! provided they pass other criteria for acceptance or rejection. \
! See also <a href=\"#valid_extensions\">valid_extensions</a> and \
! <a href=\"#bad_extensions\">bad_extensions</a>. \
" }, \
{ "bad_querystr", "", \
"pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: forum=private section=topsecret&passwd=required", " \
--
Lachlan Andrew lh...@ee... Phone: +613 8344-3816 Fax: +613 8344-6678
Department of Electrical and Electronic Engineering CRICOS Provider Code
University of Melbourne, Victoria, 3010 AUSTRALIA 00116K
|