From: Geert J. <gj...@us...> - 2002-09-03 19:40:51
|
Update of /cvsroot/woc/woc/src/woc/cgi-src/search In directory usw-pr-cvs1:/tmp/cvs-serv23171/woc/cgi-src/search Added Files: fuzzy.pl realsearch.pl sa.pl Log Message: --- NEW FILE: fuzzy.pl --- #!/usr/local/bin/perl -w push (@INC, "."); use LWP; use CGI qw(:standard); use WWW::Search; use String::Approx 'amatch'; # get the parameters from the stdin # my $zoekarg = param("query"); my $zoekorg = $zoekarg; my $text = param("text"); # aanpassen zoekarg my $pre_asterix = ""; my $pre_quest = ""; if ($zoekorg =~ s/^\*//) { $pre_asterix = "yes"; } if ($zoekorg =~ s/^\?//) { $pre_quest = "yes"; } $zoekarg =~ s/\./\\\./g; $zoekarg =~ s/\?/\./g; $zoekarg =~ s/\*/\(\\\w\|\\\-\|\\\(\|\\\)\)\+\?/g; # initialize dirs and files # my $foundfile = "arguments_found.txt"; my $notfoundfile = "arguments_notfound.txt"; my $root = "/vol/www/woc/web-docs"; my $datadir = "/vol/www/woc/data/wml"; my $destdir = "gui/items"; my $desturl = "http://www-woc.sci.kun.nl"; my $url = ""; my @files = <$datadir/*.xml>; my @foundfiles = (); # Mime-Type header line # needs to be placed elsewhere (bug #11) # print "Content-type: text/html\n\n"; #debug #print "-$zoekorg- -> -$zoekarg-\n"; if ($zoekarg eq "") { if ($text eq "") { print "Content-type: text/html\n\n"; open (FILE, "<$root/gui/search.html"); while (<FILE>) { print; }; close(FILE); } elsif ($text ne "") { #print "Text found!"; print "Content-type: text/html\n\n"; open (FILE, "<$root/gui/search.html"); while (<FILE>) { # note that \s+ was substituted by \s+, but that doesn't work... "\ " does! s/NAME="query"\s+VALUE=\"\"/NAME="query"\ VALUE=\"$text\"/; print; } close(FILE); } } else { $hit = ""; foreach $file (@files) { $file =~ s/.*?(\w*).xml/$1/; open (OPENFILE, "<$datadir/$file.xml") || next; #print "$file opened\n"; my $found = 0; my $titel = ""; my $code = ""; my $line = ""; while (defined ($line = <OPENFILE>) && !$found) { # prevent from crashing due to match in the first or second line (bug #29) if ($line =~ /<\?xml.*\?>/) { # skip line } elsif ($line =~ /DOCTYPE/) { # skip line } else { if ($line =~ /<ITEM/i) { if ($line =~ /NAME=\"(.*?)\"/i) { $title = $1; } if ($line =~ /CODE=\"(.*?)\"/i) { $code = $1; } } $_ = $line; #print "\$\_: $_"; if (amatch($zoekarg, ['i', 'S2']) && !$found) { print "fuzzy match: $title -> $_"; # match fuzzy if ( ! (-f "$root/$destdir/$file.shtml") ) { $file = $code; } if ( -f "$root/$destdir/$file.shtml" ) { push (@foundfiles, "$root/$destdir/$file.shtml"); $hit = "$root/$destdir/$file.shtml"; $url = "$desturl/$destdir/$file.shtml"; $htmlcode = "<dd><a href=\"$url\">$title</a> \n"; $yield = 50; #print "length: " . length($title) . "\n"; $yield = int((length $zoekorg) / (length($title)) * 50) if (length $title > 0); $_ = $title; if ( $title =~ /$zoekorg/i ) { $yield = $yield + 50; } $htmlcode .= "$yield\%<br>\n"; $key = $yield; if ($key < 100) { if ($key < 10) { $key = "0$key"; } $key = "0$key"; } if ($results{$key}) { while ($results{$key}) { $key = "$key.1"; } $results{$key} = $htmlcode; } else { $results{$key} = $htmlcode; } } $found = 1; } } } close (OPENFILE); } my $i = @foundfiles; #$print "tijdelijke debug info: $i\n"; if (@foundfiles == 1) { print "Location: $url\n"; print "Content-type: text/html\n\n"; open (FOUND, "$hit"); while (<FOUND>) { print; }; close (FOUND); } else { print "Content-type: text/html\n\n"; print "<html>\n"; print "<head>\n"; print " <title>Woordenboek Organische Chemie</title>\n"; print " <base href=\"http://www-woc.sci.kun.nl/\">\n"; print " <link rel=\"stylesheet\" href=\"gui/styles/woc.css\" type=\"text/css\">\n"; print " <script src=\"gui/javascript/isframe.js\"></script>\n"; print " <script src=\"gui/javascript/location.js\"></script>\n"; print " <script>\n"; print " <!--\n"; print " setLocation(\"Zoekresultaten\");\n"; print " // -->\n"; print " </script>\n"; print "</head>\n"; print "<body class=\"main\">\n"; print " <h1>Zoekresultaten (met fuzzy zoeken)</h1>\n<hr>\n"; print " Zoekargument is "; if ($pre_asterix) { print "*"; } elsif ($pre_quest) { print "?"; } print "$zoekorg.\n\n"; if (@foundfiles) { @keyz = reverse sort keys %results; foreach $key (@keyz) { # $intval = int $key; print "$results{$key}\n"; } if (open ARGFILE, ">>$foundfile") { print ARGFILE "$zoekarg\n"; close ARGFILE; } } else { print "\n<dd>Er is geen informatie gevonden.<br>\n"; print "<dd>U kunt vragen stellen \n"; print "via <a href=\"mailto:woc\@sci.kun.nl\">email</a> aan de WOC-redactie.\n"; if (open ARGFILE, ">>$notfoundfile") { print ARGFILE "$zoekarg\n"; close ARGFILE; } } print "</body>\n"; print "</html>\n"; } } --- NEW FILE: realsearch.pl --- #!/usr/local/bin/perl -w use diagnostics; use LWP; use XML::Parser; require WWW::Search; use CGI qw(:standard); my $script_title = "netsearch.pl"; my $searchengine = "AltaVista"; my $zoekarg = param("query"); my $lang = param("lang"); my @synoniemen = (); unless ($zoekarg) { $zoekarg = "azijnzuur"; }; unless ($lang) { $lang = "EN"; }; if ($zoekarg =~ /^WOC/) { # it is NT! it is a WOC number! my $line = `grep "$zoekarg" /vol/www/woc/data/wml/*.xml`; if ($line =~ /^.*\/(.*?)\.xml.*?NAME=\"(.*?)\">/) { $zoekarg = $1; } } my $fileurl = "file:///vol/www/woc/data/wml/$zoekarg.xml"; print "Content-type: text/html\n\n"; sub handle_start { my ($p, $data) = @_; $charstring = ""; $thisone = 0; if ($data eq "LANG") { shift; shift; while (@_) { my $att = shift; my $value = shift; if ($att eq "ID") { if ($value eq $lang) { $thisone = 1; } } } } } sub handle_end { my ($p, $data) = @_; if ($data eq "LANG") { if ($thisone) { push @synoniemen, $charstring; #print "$charstring\n"; } } } sub handle_char { my ($p, $data) = @_; $charstring = "$charstring$data"; } print "<html>\n"; print "<head>\n"; print " <title>Woordenboek Organische Chemie</title>\n"; print " <base href=\"http://www-woc.sci.kun.nl/\">\n"; print " <link rel=\"stylesheet\" href=\"gui/styles/woc.css\" type=\"text/css\">\n"; print " <script src=\"gui/javascript/isframe.js\"></script>\n"; print " <script src=\"gui/javascript/location.js\"></script>\n"; print " <script>\n"; print " <!--\n"; print " setLocation(\"Internet Zoekmachine\");\n"; print " // -->\n"; print " </script>\n"; print "</head>\n"; print "<body class=\"main\">\n"; print " <h1>Zoekresultaten Internet</h1>\n<hr>\n"; #maken lijst met synoniemen @synoniemen = (); my $browser = LWP::UserAgent->new(); $browser->agent($script_title); my $request = HTTP::Request->new(GET => $fileurl); my $foo = $browser->request($request); if ($foo->is_success) { $superxmlfile = $foo->content; } else { print " <ul>\n"; print " Er is onvoldoende informatie beschikbaar. Deze functie werkt\n"; print " voor dit woord niet. Er is hiervan melding gemaakt bij de\n"; print " redactie.\n"; print " </ul>\n"; print "</body>\n"; print "</html>\n"; open ARGFILE, ">>realsearch_errors.txt"; print ARGFILE "$zoekarg\n"; close ARGFILE; exit(1); } # Start parsing super.xml my $p1 = new XML::Parser(Handlers => {Start => \&handle_start, End => \&handle_end, Char => \&handle_char}); $p1->parse($superxmlfile); # maken zoekstring $aantal = scalar(@synoniemen); if ($aantal > 1) { $i = 1; $zoekstring = ""; while ($i <= $aantal) { if ($i > 1) { $zoekstring .= " OR "; }; if ($zoekstring) { $zoekstring .= "\"".$synoniemen[$i-1]."\""; } else { $zoekstring = "\"".$synoniemen[$i-1]."\""; } $i++; } } else { $synon = $synoniemen[0]; $zoekstring = "\"$synon\""; } print " Zoekargument is $zoekstring.\n\n"; my($search) = new WWW::Search($searchengine); $search->native_query(WWW::Search::escape_query($zoekstring)); $max = $search->maximum_to_retrieve(10); $max = 0; @results = $search->results(); print " <ul>\n"; if (scalar(@results) == 0) { print "Er is helaas geen informatie gevonden.\n"; } else { $i = 0; while ($i < 10) { $result = shift @results; if ($result) { $link = $result->url; print "<b>",++$i,"</b> <a href=\"",$link,"\">",$link,"</a><br>\n"; $lengte = ($result->size / 1024); $ua = new LWP::UserAgent; $ua->agent("WOC-SearchEngine/0.1 "); my $req = new HTTP::Request POST => $link; my $res = $ua->request($req); if ($res->is_success) { $doc = $res->content; $lengte = int ( (length($doc) / 1024 ) * 10) / 10; } print "<dd>lengte: $lengte kb<br>\n"; }; }; }; print " </ul>\n"; print "</body>\n"; print "</html>\n"; open ARGFILE, ">>realsearch_searched.txt"; print ARGFILE "$zoekarg\n"; close ARGFILE; --- NEW FILE: sa.pl --- #!/usr/local/bin/perl -w use strict; use LWP; use CGI qw(:standard); # get the parameters from the stdin # my $zoekarg = param("query"); my $zoekorg = $zoekarg; my $text = param("text"); my $CAS = param("CAS") || ""; # aanpassen zoekarg my $pre_asterix = ""; my $pre_quest = ""; if ($zoekorg =~ s/^\*//) { $pre_asterix = "yes"; } if ($zoekorg =~ s/^\?//) { $pre_quest = "yes"; } $zoekarg =~ s/\-/\\\-/g; $zoekarg =~ s/\./\\\./g; $zoekarg =~ s/\?/\./g; $zoekarg =~ s/\*/\(\\\w\|\\\-\|\\\(\|\\\)\)\+\?/g; # initialize dirs and files # my $foundfile = "arguments_found.txt"; my $notfoundfile = "arguments_notfound.txt"; my $root = "/vol/www/woc/web-docs"; my $datadir = "/vol/www/woc/data/wml"; my $destdir = "gui/items"; my $desturl = "http://www-woc.sci.kun.nl/"; my $url = ""; my @files = <$datadir/*.xml>; my @foundfiles = (); # Mime-Type header line # needs to be placed elsewhere (bug #11) # print "Content-type: text/html\n\n"; #debug #print "-$zoekorg- -> -$zoekarg-\n"; #print "-$CAS-\n"; if ($zoekarg eq "") { if ($text eq "") { print "Content-type: text/html\n\n"; open (FILE, "<$root/gui/search.html"); while (<FILE>) { print; }; close(FILE); } elsif ($text ne "") { #print "Text found!"; print "Content-type: text/html\n\n"; open (FILE, "<$root/gui/search.html"); while (<FILE>) { # note that \s+ was substituted by \s+, but that doesn't work... "\ " does! s/NAME="query"\s+VALUE=\"\"/NAME="query"\ VALUE=\"$text\"/; print; } close(FILE); } } else { my $hit = ""; my $url = ""; my %results; foreach my $file (@files) { $file =~ s/.*?(\w*).xml/$1/; open (OPENFILE, "<$datadir/$file.xml") || next; #print "$file opened\n"; my $found = 0; my $title = ""; my $code = ""; while ( <OPENFILE> ) { # prevent from crashing due to match in the first or second line (bug #29) if (m/<\?xml.*\?>/) { # skip line } elsif (m/DOCTYPE/) { # skip line } else { if (/<ITEM/i) { if (/NAME=\"(.*?)\"/i) { $title = $1; } if (/CODE=\"(.*?)\"/i) { $code = $1; } } elsif ( ( ( ($CAS ne "yes") && /$zoekarg/i) || ( ($CAS eq "yes") && /CAS-NUMBER.*$zoekarg/i) ) && !$found) { if ( ! (-r "$root/$destdir/$file.shtml") ) { $file = $code; } if ( -r "$root/$destdir/$file.shtml" ) { push (@foundfiles, "$root/$destdir/$file.shtml"); $hit = "$root/$destdir/$file.shtml"; $url = "$desturl/$destdir/$file.shtml"; my $htmlcode = "<dd><a href=\"$url\">$title</a> \n"; my $yield = 50; #print "length: " . length($title) . "\n"; $yield = int((length $zoekorg) / (length($title)) * 50) if (length $title > 0); $_ = $title; if ( $title =~ /$zoekorg/i ) { $yield = $yield + 50; } $htmlcode .= "$yield\%<br>\n"; my $key = $yield; if ($key < 100) { if ($key < 10) { $key = "0$key"; } $key = "0$key"; } if ($results{$key}) { while ($results{$key}) { $key = "$key.1"; } $results{$key} = $htmlcode; } else { $results{$key} = $htmlcode; } } $found = 1; } } } close (OPENFILE); } #my $i = $foundfiles; #$print "tijdelijke debug info: $i\n"; if (scalar(@foundfiles) == 1) { print "Location: $url\n"; print "Content-type: text/html\n\n"; open (FOUND, "$hit"); while (<FOUND>) { print; }; close (FOUND); } elsif (scalar(@foundfiles) > 1) { print "Content-type: text/html\n\n"; print "<html>\n"; print "<head>\n"; print " <title>Woordenboek Organische Chemie</title>\n"; print " <base href=\"http://www-woc.sci.kun.nl/\">\n"; print " <link rel=\"stylesheet\" href=\"gui/styles/woc.css\" type=\"text/css\">\n"; print " <script src=\"gui/javascript/isframe.js\"></script>\n"; print " <script src=\"gui/javascript/location.js\"></script>\n"; print " <script>\n"; print " <!--\n"; print " setLocation(\"Zoekresultaten\");\n"; print " // -->\n"; print " </script>\n"; print "</head>\n"; print "<body class=\"main\">\n"; print " <h1>Zoekresultaten</h1>\n<hr>\n"; if (@foundfiles) { print " Zoekargument is "; if ($pre_asterix) { print "*"; } elsif ($pre_quest) { print "?"; } print "$zoekorg.\n\n"; my @keyz = reverse sort keys %results; foreach my $key (@keyz) { # $intval = int $key; print "$results{$key}\n"; } } print "</body>\n"; print "</html>\n"; } else { $zoekorg =~ s/\*//g; $zoekorg =~ s/\?//g; # Omdat de fuzzy search functionaliteit te veel vraagt van de webserver # wordt deze off line gebracht (dd. 2001-01-28) # Inmiddels kan door het in ongebruik raken van het dadml.pl script, kan # de fuzzy zoekmachine weer gebruikt worden. (dd 2001-02-24) #open ARGFILE, ">>$notfoundfile"; #print ARGFILE "$zoekorg\n"; #close ARGFILE; if ($CAS ne "yes") { print "Location: http://www-woc.sci.kun.nl/cgi-bin/search/fuzzy.pl?query=$zoekorg\n"; print "Content-type: text/html\n\n"; } else { # de onderstaande code kwam voor de fuzzy search in de plaats print "Content-type: text/html\n\n"; print "<html>\n"; print "<head>\n"; print " <title>Woordenboek Organische Chemie</title>\n"; print " <base href=\"http://www-woc.sci.kun.nl/\">\n"; print " <link rel=\"stylesheet\" href=\"gui/styles/woc.css\" type=\"text/css\">\n"; print " <script src=\"gui/javascript/isframe.js\"></script>\n"; print " <script src=\"gui/javascript/location.js\"></script>\n"; print " <script>\n"; print " <!--\n"; print " setLocation(\"Zoekresultaten\");\n"; print " // -->\n"; print " </script>\n"; print "</head>\n"; print "<body class=\"main\">\n"; print " <h1>Zoekresultaten</h1>\n<hr>\n"; print " Helaas zijn er geen resultaten gevonden. Pas uw\n"; print " zoekvraag aan, of <a href=\"mailto:woc\@sci.kun.nl\">stuur een email</a>.\n"; print "</body>\n"; print "</html>\n"; if (open ARGFILE, ">>$notfoundfile") { print ARGFILE "$zoekorg\n"; close ARGFILE; } } } } |