From: <bi...@us...> - 2008-12-11 22:58:33
Revision: 2660
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2660&view=rev
Author:   binzino
Date:     2008-12-11 22:58:28 +0000 (Thu, 11 Dec 2008)

Log Message:
-----------
Initial checkin of Nutch source-files that are over-ridden and copied into the Nutch source tree when compiling.

Added Paths:
-----------
    trunk/archive-access/projects/nutchwax/archive/src/nutch/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java
    trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java

Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java	(rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/FetchedSegments.java	2008-12-11 22:58:28 UTC (rev 2660)
@@ -0,0 +1,375 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.searcher;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.BufferedReader;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.fs.*;
+import org.apache.nutch.protocol.*;
+import org.apache.nutch.parse.*;
+import org.apache.nutch.util.HadoopFSUtil;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.*;
+import org.apache.hadoop.mapred.lib.*;
+import org.apache.nutch.crawl.*;
+
+/** Implements {@link HitSummarizer} and {@link HitContent} for a set of
+ * fetched segments.
*/ +public class FetchedSegments implements HitSummarizer, HitContent +{ + public static final Log LOG = LogFactory.getLog(FetchedSegments.class); + + private static class Segment implements Closeable { + + private static final Partitioner PARTITIONER = new HashPartitioner(); + + private FileSystem fs; + private Path segmentDir; + + private MapFile.Reader[] content; + private MapFile.Reader[] parseText; + private MapFile.Reader[] parseData; + private MapFile.Reader[] crawl; + private Configuration conf; + + public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException { + this.fs = fs; + this.segmentDir = segmentDir; + this.conf = conf; + } + + public CrawlDatum getCrawlDatum(Text url) throws IOException { + synchronized (this) { + if (crawl == null) + crawl = getReaders(CrawlDatum.FETCH_DIR_NAME); + } + return (CrawlDatum)getEntry(crawl, url, new CrawlDatum()); + } + + public byte[] getContent(Text url) throws IOException { + synchronized (this) { + if (content == null) + content = getReaders(Content.DIR_NAME); + } + return ((Content)getEntry(content, url, new Content())).getContent(); + } + + public ParseData getParseData(Text url) throws IOException { + synchronized (this) { + if (parseData == null) + parseData = getReaders(ParseData.DIR_NAME); + } + return (ParseData)getEntry(parseData, url, new ParseData()); + } + + public ParseText getParseText(Text url) throws IOException { + synchronized (this) { + if (parseText == null) + parseText = getReaders(ParseText.DIR_NAME); + } + return (ParseText)getEntry(parseText, url, new ParseText()); + } + + private MapFile.Reader[] getReaders(String subDir) throws IOException { + return MapFileOutputFormat.getReaders(fs, new Path(segmentDir, subDir), this.conf); + } + + private Writable getEntry(MapFile.Reader[] readers, Text url, + Writable entry) throws IOException { + return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry); + } + + public void close() throws IOException { + if (content != null) { closeReaders(content); } + if (parseText != null) { closeReaders(parseText); } + if (parseData != null) { closeReaders(parseData); } + if (crawl != null) { closeReaders(crawl); } + } + + private void closeReaders(MapFile.Reader[] readers) throws IOException { + for (int i = 0; i < readers.length; i++) { + readers[i].close(); + } + } + + } + + private HashMap segments = new HashMap( ); + private boolean perCollection = false; + private Summarizer summarizer; + + /** Construct given a directory containing fetcher output. */ + public FetchedSegments(FileSystem fs, String segmentsDir, Configuration conf) throws IOException + { + this.summarizer = new SummarizerFactory(conf).getSummarizer(); + + Path[] segmentDirs = HadoopFSUtil.getPaths( fs.listStatus(new Path(segmentsDir), HadoopFSUtil.getPassDirectoriesFilter(fs)) ); + if ( segmentDirs == null ) + { + LOG.warn( "No segment directories: " + segmentsDir ); + return ; + } + + this.perCollection = conf.getBoolean( "nutchwax.FetchedSegments.perCollection", false ); + + LOG.info( "Per-collection segments: " + this.perCollection ); + + for ( int i = 0; i < segmentDirs.length; i++ ) + { + if ( this.perCollection ) + { + // Assume segmentDir is actually a 'collection' dir which + // contains a list of segments, such as: + // crawl/segments/194/segment-foo + // /segment-bar + // /segment-baz + // crawl/segments/366/segment-frotz + // /segment-fizzle + // /segment-bizzle + // The '194' and '366' are collection dirs, which contain the + // actual segment dirs. 
+ Path collectionDir = segmentDirs[i]; + + Map perCollectionSegments = (Map) this.segments.get( collectionDir.getName( ) ); + if ( perCollectionSegments == null ) + { + perCollectionSegments = new HashMap( ); + this.segments.put( collectionDir.getName( ), perCollectionSegments ); + } + + // Now, get a list of all the sub-dirs of the collectionDir, + // and create segments for them, adding them to the + // per-collection map. + Path[] perCollectionSegmentDirs = HadoopFSUtil.getPaths( fs.listStatus( collectionDir, HadoopFSUtil.getPassDirectoriesFilter(fs) ) ); + for ( Path segmentDir : perCollectionSegmentDirs ) + { + perCollectionSegments.put( segmentDir.getName( ), new Segment( fs, segmentDir, conf ) ); + } + + addRemaps( fs, collectionDir, (Map<String,Segment>) perCollectionSegments ); + } + else + { + Path segmentDir = segmentDirs[i]; + segments.put(segmentDir.getName(), new Segment(fs, segmentDir, conf)); + } + } + + // If we not-doing perCollection segments, process a single + // "remap" file for the "segments" dir. + if ( ! this.perCollection ) + { + addRemaps( fs, new Path(segmentsDir), (Map<String,Segment>) segments ); + } + + LOG.info( "segments: " + segments ); + } + + protected void addRemaps( FileSystem fs, Path segmentDir, Map<String,Segment> segments ) + throws IOException + { + Path segmentRemapFile = new Path( segmentDir, "remap" ); + + if ( ! fs.exists( segmentRemapFile ) ) + { + LOG.warn( "Remap file doesn't exist: " + segmentRemapFile ); + + return ; + } + + // InputStream is = segmentRemapFile.getFileSystem( conf ).open( segmentRemapFile ); + InputStream is = fs.open( segmentRemapFile ); + + BufferedReader reader = new BufferedReader( new InputStreamReader( is, "UTF-8" ) ); + + String line; + while ( (line = reader.readLine()) != null ) + { + String fields[] = line.trim( ).split( "\\s+" ); + + if ( fields.length < 2 ) + { + LOG.warn( "Malformed remap line, not enough fields ("+fields.length+"): " + line ); + continue ; + } + + // Look for the "to" name in the segments. + Segment toSegment = segments.get( fields[1] ); + if ( toSegment == null ) + { + LOG.warn( "Segment remap destination doesn't exist: " + fields[1] ); + } + else + { + LOG.warn( "Remap: " + fields[0] + " => " + fields[1] ); + segments.put( fields[0], toSegment ); + } + } + } + + + public String[] getSegmentNames() { + return (String[])segments.keySet().toArray(new String[segments.size()]); + } + + public byte[] getContent(HitDetails details) throws IOException { + return getSegment(details).getContent(getUrl(details)); + } + + public ParseData getParseData(HitDetails details) throws IOException { + return getSegment(details).getParseData(getUrl(details)); + } + + public long getFetchDate(HitDetails details) throws IOException { + return getSegment(details).getCrawlDatum(getUrl(details)) + .getFetchTime(); + } + + public ParseText getParseText(HitDetails details) throws IOException { + return getSegment(details).getParseText(getUrl(details)); + } + + public Summary getSummary(HitDetails details, Query query) + throws IOException { + + if (this.summarizer == null) { return new Summary(); } + + Segment segment = getSegment(details); + ParseText parseText = segment.getParseText(getUrl(details)); + String text = (parseText != null) ? 
parseText.getText() : ""; + + return this.summarizer.getSummary(text, query); + } + + private class SummaryThread extends Thread { + private HitDetails details; + private Query query; + + private Summary summary; + private Throwable throwable; + + public SummaryThread(HitDetails details, Query query) { + this.details = details; + this.query = query; + } + + public void run() { + try { + this.summary = getSummary(details, query); + } catch (Throwable throwable) { + this.throwable = throwable; + } + } + + } + + + public Summary[] getSummary(HitDetails[] details, Query query) + throws IOException { + SummaryThread[] threads = new SummaryThread[details.length]; + for (int i = 0; i < threads.length; i++) { + threads[i] = new SummaryThread(details[i], query); + threads[i].start(); + } + + Summary[] results = new Summary[details.length]; + for (int i = 0; i < threads.length; i++) { + try { + threads[i].join(); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + if (threads[i].throwable instanceof IOException) { + throw (IOException)threads[i].throwable; + } else if (threads[i].throwable != null) { + throw new RuntimeException(threads[i].throwable); + } + results[i] = threads[i].summary; + } + return results; + } + + + private Segment getSegment(HitDetails details) + { + if ( this.perCollection ) + { + LOG.info( "getSegment: " + details ); + LOG.info( " collection: " + details.getValue("collection") ); + LOG.info( " segment : " + details.getValue("segment") ); + + String collectionId = details.getValue("collection"); + String segmentName = details.getValue("segment"); + + Map perCollectionSegments = (Map) this.segments.get( collectionId ); + + Segment segment = (Segment) perCollectionSegments.get( segmentName ); + + if ( segment == null ) + { + LOG.warn( "Didn't find segment: collection=" + collectionId + " segment=" + segmentName ); + } + + return segment; + } + else + { + LOG.info( "getSegment: " + details ); + LOG.info( " segment : " + details.getValue("segment") ); + + String segmentName = details.getValue( "segment" ); + Segment segment = (Segment) segments.get( segmentName ); + + if ( segment == null ) + { + LOG.warn( "Didn't find segment: " + segmentName ); + } + + return segment; + } + } + + private Text getUrl(HitDetails details) { + String url = details.getValue("orig"); + if (StringUtils.isBlank(url)) { + url = details.getValue("url"); + } + return new Text(url); + } + + public void close() throws IOException { + Iterator iterator = segments.values().iterator(); + while (iterator.hasNext()) { + ((Segment) iterator.next()).close(); + } + } + +} Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/IndexSearcher.java 2008-12-11 22:58:28 UTC (rev 2660) @@ -0,0 +1,179 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.FloatWritable; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableComparable; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.nutch.indexer.FsDirectory; +import org.apache.nutch.indexer.NutchSimilarity; + +/** Implements {@link Searcher} and {@link HitDetailer} for either a single + * merged index, or a set of indexes. */ +public class IndexSearcher implements Searcher, HitDetailer { + + private org.apache.lucene.search.Searcher luceneSearcher; + private org.apache.lucene.index.IndexReader reader; + private LuceneQueryOptimizer optimizer; + private FileSystem fs; + private Configuration conf; + private QueryFilters queryFilters; + + /** Construct given a number of indexes. */ + public IndexSearcher(Path[] indexDirs, Configuration conf) throws IOException { + IndexReader[] readers = new IndexReader[indexDirs.length]; + this.conf = conf; + this.fs = FileSystem.get(conf); + for (int i = 0; i < indexDirs.length; i++) { + readers[i] = IndexReader.open(getDirectory(indexDirs[i])); + } + init(new MultiReader(readers), conf); + } + + /** Construct given a single merged index. 
*/ + public IndexSearcher(Path index, Configuration conf) + throws IOException { + this.conf = conf; + this.fs = FileSystem.get(conf); + init(IndexReader.open(getDirectory(index)), conf); + } + + private void init(IndexReader reader, Configuration conf) throws IOException { + this.reader = reader; + this.luceneSearcher = new org.apache.lucene.search.IndexSearcher(reader); + this.luceneSearcher.setSimilarity(new NutchSimilarity()); + this.optimizer = new LuceneQueryOptimizer(conf); + this.queryFilters = new QueryFilters(conf); + } + + private Directory getDirectory(Path file) throws IOException { + if ("file".equals(this.fs.getUri().getScheme())) { + Path qualified = file.makeQualified(FileSystem.getLocal(conf)); + File fsLocal = new File(qualified.toUri()); + return FSDirectory.getDirectory(fsLocal.getAbsolutePath()); + } else { + return new FsDirectory(this.fs, file, false, this.conf); + } + } + + public Hits search(Query query, int numHits, + String dedupField, String sortField, boolean reverse) + + throws IOException { + org.apache.lucene.search.BooleanQuery luceneQuery = + this.queryFilters.filter(query); + + System.out.println( "Nutch query: " + query ); + System.out.println( "Lucene query: " + luceneQuery ); + + return translateHits + (optimizer.optimize(luceneQuery, luceneSearcher, numHits, + sortField, reverse), + dedupField, sortField); + } + + public String getExplanation(Query query, Hit hit) throws IOException { + return luceneSearcher.explain(this.queryFilters.filter(query), + hit.getIndexDocNo()).toHtml(); + } + + public HitDetails getDetails(Hit hit) throws IOException { + + Document doc = luceneSearcher.doc(hit.getIndexDocNo()); + + List docFields = doc.getFields(); + String[] fields = new String[docFields.size()]; + String[] values = new String[docFields.size()]; + for (int i = 0; i < docFields.size(); i++) { + Field field = (Field)docFields.get(i); + fields[i] = field.name(); + values[i] = field.stringValue(); + } + + return new HitDetails(fields, values); + } + + public HitDetails[] getDetails(Hit[] hits) throws IOException { + HitDetails[] results = new HitDetails[hits.length]; + for (int i = 0; i < hits.length; i++) + results[i] = getDetails(hits[i]); + return results; + } + + private Hits translateHits(TopDocs topDocs, + String dedupField, String sortField) + throws IOException { + + String[] dedupValues = null; + if (dedupField != null) + dedupValues = FieldCache.DEFAULT.getStrings(reader, dedupField); + + ScoreDoc[] scoreDocs = topDocs.scoreDocs; + int length = scoreDocs.length; + Hit[] hits = new Hit[length]; + for (int i = 0; i < length; i++) { + + int doc = scoreDocs[i].doc; + + WritableComparable sortValue; // convert value to writable + if (sortField == null) { + sortValue = new FloatWritable(scoreDocs[i].score); + } else { + Object raw = ((FieldDoc)scoreDocs[i]).fields[0]; + if (raw instanceof Integer) { + sortValue = new IntWritable(((Integer)raw).intValue()); + } else if (raw instanceof Float) { + sortValue = new FloatWritable(((Float)raw).floatValue()); + } else if (raw instanceof String) { + sortValue = new Text((String)raw); + } else { + throw new RuntimeException("Unknown sort value type!"); + } + } + + String dedupValue = dedupValues == null ? 
null : dedupValues[doc]; + + hits[i] = new Hit(doc, sortValue, dedupValue); + } + return new Hits(topDocs.totalHits, hits); + } + + public void close() throws IOException { + if (luceneSearcher != null) { luceneSearcher.close(); } + if (reader != null) { reader.close(); } + } + +} Added: trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/nutch/src/java/org/apache/nutch/searcher/OpenSearchServlet.java 2008-12-11 22:58:28 UTC (rev 2660) @@ -0,0 +1,333 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.searcher; + +import java.io.IOException; +import java.net.URLEncoder; +import java.util.Map; +import java.util.HashMap; +import java.util.Set; +import java.util.HashSet; + +import javax.servlet.ServletException; +import javax.servlet.ServletConfig; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import javax.xml.parsers.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.w3c.dom.*; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.Transformer; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + + +/** Present search results using A9's OpenSearch extensions to RSS, plus a few + * Nutch-specific extensions. 
*/ +public class OpenSearchServlet extends HttpServlet { + private static final Map NS_MAP = new HashMap(); + private int MAX_HITS_PER_PAGE; + + static { + NS_MAP.put("opensearch", "http://a9.com/-/spec/opensearchrss/1.0/"); + NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/"); + } + + private static final Set SKIP_DETAILS = new HashSet(); + static { + SKIP_DETAILS.add("url"); // redundant with RSS link + SKIP_DETAILS.add("title"); // redundant with RSS title + } + + private NutchBean bean; + private Configuration conf; + + public void init(ServletConfig config) throws ServletException { + try { + this.conf = NutchConfiguration.get(config.getServletContext()); + bean = NutchBean.get(config.getServletContext(), this.conf); + } catch (IOException e) { + throw new ServletException(e); + } + MAX_HITS_PER_PAGE = conf.getInt("searcher.max.hits.per.page", -1); + } + + public void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("query request from " + request.getRemoteAddr()); + } + + // get parameters from request + request.setCharacterEncoding("UTF-8"); + String queryString = request.getParameter("query"); + if (queryString == null) + queryString = ""; + String urlQuery = URLEncoder.encode(queryString, "UTF-8"); + + // the query language + String queryLang = request.getParameter("lang"); + + int start = 0; // first hit to display + String startString = request.getParameter("start"); + if (startString != null) + start = Integer.parseInt(startString); + + int hitsPerPage = 10; // number of hits to display + String hitsString = request.getParameter("hitsPerPage"); + if (hitsString != null) + hitsPerPage = Integer.parseInt(hitsString); + if(MAX_HITS_PER_PAGE > 0 && hitsPerPage > MAX_HITS_PER_PAGE) + hitsPerPage = MAX_HITS_PER_PAGE; + + String sort = request.getParameter("sort"); + boolean reverse = + sort!=null && "true".equals(request.getParameter("reverse")); + + // De-Duplicate handling. Look for duplicates field and for how many + // duplicates per results to return. Default duplicates field is 'site' + // and duplicates per results default is '2'. + String dedupField = request.getParameter("dedupField"); + if (dedupField == null || dedupField.length() == 0) { + dedupField = "site"; + } + int hitsPerDup = 2; + String hitsPerDupString = request.getParameter("hitsPerDup"); + if (hitsPerDupString != null && hitsPerDupString.length() > 0) { + hitsPerDup = Integer.parseInt(hitsPerDupString); + } else { + // If 'hitsPerSite' present, use that value. + String hitsPerSiteString = request.getParameter("hitsPerSite"); + if (hitsPerSiteString != null && hitsPerSiteString.length() > 0) { + hitsPerDup = Integer.parseInt(hitsPerSiteString); + } + } + + // Make up query string for use later drawing the 'rss' logo. + String params = "&hitsPerPage=" + hitsPerPage + + (queryLang == null ? "" : "&lang=" + queryLang) + + (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") + + (dedupField == null ? 
"" : "&dedupField=" + dedupField)); + + Query query = Query.parse(queryString, queryLang, this.conf); + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("query: " + queryString); + NutchBean.LOG.info("lang: " + queryLang); + } + + // execute the query + Hits hits; + try { + hits = bean.search(query, start + hitsPerPage, hitsPerDup, dedupField, + sort, reverse); + } catch (IOException e) { + if (NutchBean.LOG.isWarnEnabled()) { + NutchBean.LOG.warn("Search Error", e); + } + hits = new Hits(0,new Hit[0]); + } + + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("total hits: " + hits.getTotal()); + } + + // generate xml results + int end = (int)Math.min(hits.getLength(), start + hitsPerPage); + int length = end-start; + + Hit[] show = hits.getHits(start, end-start); + HitDetails[] details = bean.getDetails(show); + Summary[] summaries = bean.getSummary(details, query); + + String requestUrl = request.getRequestURL().toString(); + String base = requestUrl.substring(0, requestUrl.lastIndexOf('/')); + + + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); + Document doc = factory.newDocumentBuilder().newDocument(); + + Element rss = addNode(doc, doc, "rss"); + addAttribute(doc, rss, "version", "2.0"); + addAttribute(doc, rss, "xmlns:opensearch", + (String)NS_MAP.get("opensearch")); + addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch")); + + Element channel = addNode(doc, rss, "channel"); + + addNode(doc, channel, "title", "Nutch: " + queryString); + addNode(doc, channel, "description", "Nutch search results for query: " + + queryString); + addNode(doc, channel, "link", + base+"/search.jsp" + +"?query="+urlQuery + +"&start="+start + +"&hitsPerDup="+hitsPerDup + +params); + + addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal()); + addNode(doc, channel, "opensearch", "startIndex", ""+start); + addNode(doc, channel, "opensearch", "itemsPerPage", ""+hitsPerPage); + + addNode(doc, channel, "nutch", "query", queryString); + + + if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show + || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){ + addNode(doc, channel, "nutch", "nextPage", requestUrl + +"?query="+urlQuery + +"&start="+end + +"&hitsPerDup="+hitsPerDup + +params); + } + + if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) { + addNode(doc, channel, "nutch", "showAllHits", requestUrl + +"?query="+urlQuery + +"&hitsPerDup="+0 + +params); + } + + for (int i = 0; i < length; i++) { + Hit hit = show[i]; + HitDetails detail = details[i]; + String title = detail.getValue("title"); + String url = detail.getValue("url"); + String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); + + if (title == null || title.equals("")) { // use url for docs w/o title + title = url; + } + + Element item = addNode(doc, channel, "item"); + + addNode(doc, item, "title", title); + if (summaries[i] != null) { + addNode(doc, item, "description", summaries[i].toString() ); + } + addNode(doc, item, "link", url); + + addNode(doc, item, "nutch", "site", hit.getDedupValue()); + + addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id); + addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id + +"&query="+urlQuery+"&lang="+queryLang); + + if (hit.moreFromDupExcluded()) { + addNode(doc, item, "nutch", "moreFromSite", requestUrl + +"?query=" + +URLEncoder.encode("site:"+hit.getDedupValue() + +" "+queryString, "UTF-8") + +"&hitsPerSite="+0 + 
+params);
+        }
+
+        for (int j = 0; j < detail.getLength(); j++) { // add all from detail
+          String field = detail.getField(j);
+          if (!SKIP_DETAILS.contains(field))
+            addNode(doc, item, "nutch", field, detail.getValue(j));
+        }
+      }
+
+      // dump DOM tree
+
+      DOMSource source = new DOMSource(doc);
+      TransformerFactory transFactory = TransformerFactory.newInstance();
+      Transformer transformer = transFactory.newTransformer();
+      transformer.setOutputProperty("indent", "yes");
+      StreamResult result = new StreamResult(response.getOutputStream());
+      response.setContentType("text/xml");
+      transformer.transform(source, result);
+
+    } catch (javax.xml.parsers.ParserConfigurationException e) {
+      throw new ServletException(e);
+    } catch (javax.xml.transform.TransformerException e) {
+      throw new ServletException(e);
+    }
+  }
+
+  private static Element addNode(Document doc, Node parent, String name) {
+    Element child = doc.createElement(name);
+    parent.appendChild(child);
+    return child;
+  }
+
+  private static void addNode(Document doc, Node parent,
+                              String name, String text) {
+    Element child = doc.createElement(name);
+    child.appendChild(doc.createTextNode(getLegalXml(text)));
+    parent.appendChild(child);
+  }
+
+  private static void addNode(Document doc, Node parent,
+                              String ns, String name, String text) {
+    Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
+    child.appendChild(doc.createTextNode(getLegalXml(text)));
+    parent.appendChild(child);
+  }
+
+  private static void addAttribute(Document doc, Element node,
+                                   String name, String value) {
+    Attr attribute = doc.createAttribute(name);
+    attribute.setValue(getLegalXml(value));
+    node.getAttributes().setNamedItem(attribute);
+  }
+
+  /*
+   * Ensure string is legal xml.
+   * @param text String to verify.
+   * @return Passed <code>text</code> or a new string with illegal
+   * characters removed if any found in <code>text</code>.
+   * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
+   */
+  protected static String getLegalXml(final String text) {
+    if (text == null) {
+      return null;
+    }
+    StringBuffer buffer = null;
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      if (!isLegalXml(c)) {
+        if (buffer == null) {
+          // Start up a buffer.  Copy characters here from now on
+          // now we've found at least one bad character in original.
+          buffer = new StringBuffer(text.length());
+          buffer.append(text.substring(0, i));
+        }
+      } else {
+        if (buffer != null) {
+          buffer.append(c);
+        }
+      }
+    }
+    return (buffer != null)? buffer.toString(): text;
+  }
+
+  private static boolean isLegalXml(final char c) {
+    return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff)
+      || (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff);
+  }
+
+}
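For reference, the per-collection segment handling that this commit adds to FetchedSegments can be exercised roughly as below. This is a minimal sketch and not part of the commit: the crawl/segments path, the "194" collection name, and the segment names are illustrative (borrowed from the comment in the constructor), while the nutchwax.FetchedSegments.perCollection property, the FetchedSegments(FileSystem, String, Configuration) constructor, getSegmentNames(), close(), and the whitespace-separated "from to" format of the optional remap file are taken from the code above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.searcher.FetchedSegments;

public class PerCollectionSegmentsExample {
  public static void main(String[] args) throws Exception {
    // Expected layout when per-collection segments are enabled
    // (from the comment in the FetchedSegments constructor):
    //
    //   crawl/segments/194/segment-foo
    //   crawl/segments/194/segment-bar
    //   crawl/segments/366/segment-frotz
    //
    // An optional "remap" file may sit inside each collection dir
    // (e.g. crawl/segments/194/remap -- hypothetical path), with one
    // whitespace-separated "<from-segment> <to-segment>" pair per line:
    //
    //   segment-old segment-foo
    //
    // Lookups for "segment-old" are then served by "segment-foo".

    Configuration conf = new Configuration();
    conf.setBoolean("nutchwax.FetchedSegments.perCollection", true);

    FileSystem fs = FileSystem.get(conf);
    FetchedSegments segments =
        new FetchedSegments(fs, "crawl/segments", conf);

    // Print the registered top-level names (collection dirs in
    // per-collection mode, plain segment dirs otherwise).
    for (String name : segments.getSegmentNames()) {
      System.out.println(name);
    }
    segments.close();
  }
}

On the query side, OpenSearchServlet above reads everything from request parameters, so a request such as ?query=archive&hitsPerPage=10&hitsPerDup=2&dedupField=site&start=0 would exercise the de-duplication handling; the servlet's mount path comes from web.xml and is not part of this commit.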