From: <bi...@us...> - 2008-12-15 02:11:18
|
Revision: 2664 http://archive-access.svn.sourceforge.net/archive-access/?rev=2664&view=rev Author: binzino Date: 2008-12-15 01:47:48 +0000 (Mon, 15 Dec 2008) Log Message: ----------- Added own version of OpenSearch servlet which adds some XML elements and has a few other enhancements. Also revised the sample XSLT to take advantage of these changes in the OpenSearch servlet. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java 2008-12-15 01:47:48 UTC (rev 2664) @@ -0,0 +1,372 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.archive.nutchwax; + +import java.io.IOException; +import java.net.URLEncoder; +import java.util.Map; +import java.util.HashMap; +import java.util.Set; +import java.util.HashSet; + +import javax.servlet.ServletException; +import javax.servlet.ServletConfig; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import javax.xml.parsers.*; + +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.util.NutchConfiguration; +import org.w3c.dom.*; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.Transformer; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + +import org.apache.nutch.searcher.Hit; +import org.apache.nutch.searcher.HitDetails; +import org.apache.nutch.searcher.Hits; +import org.apache.nutch.searcher.NutchBean; +import org.apache.nutch.searcher.Query; +import org.apache.nutch.searcher.Summary; + +/** + * Present search results using A9's OpenSearch extensions to RSS, + * plus a few Nutch-specific extensions. 
+ */ +public class OpenSearchServlet extends HttpServlet +{ + private static final Map NS_MAP = new HashMap(); + private int MAX_HITS_PER_PAGE; + + static { + NS_MAP.put("opensearch", "http://a9.com/-/spec/opensearchrss/1.0/"); + NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/"); + } + + private static final Set SKIP_DETAILS = new HashSet(); + static { + SKIP_DETAILS.add("url"); // redundant with RSS link + SKIP_DETAILS.add("title"); // redundant with RSS title + } + + private NutchBean bean; + private Configuration conf; + + public void init(ServletConfig config) throws ServletException { + try { + this.conf = NutchConfiguration.get(config.getServletContext()); + bean = NutchBean.get(config.getServletContext(), this.conf); + } catch (IOException e) { + throw new ServletException(e); + } + MAX_HITS_PER_PAGE = conf.getInt("searcher.max.hits.per.page", -1); + } + + public void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + + long responseTime = System.nanoTime( ); + + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("query request from " + request.getRemoteAddr()); + } + + // get parameters from request + request.setCharacterEncoding("UTF-8"); + String queryString = request.getParameter("query"); + if (queryString == null) + queryString = ""; + String urlQuery = URLEncoder.encode(queryString, "UTF-8"); + + // the query language + String queryLang = request.getParameter("lang"); + + int start = 0; // first hit to display + String startString = request.getParameter("start"); + if (startString != null) + start = Integer.parseInt(startString); + + int hitsPerPage = 10; // number of hits to display + String hitsString = request.getParameter("hitsPerPage"); + if (hitsString != null) + hitsPerPage = Integer.parseInt(hitsString); + if(MAX_HITS_PER_PAGE > 0 && hitsPerPage > MAX_HITS_PER_PAGE) + hitsPerPage = MAX_HITS_PER_PAGE; + + String sort = request.getParameter("sort"); + boolean reverse = 
sort != null && "true".equals(request.getParameter("reverse")); + + // De-Duplicate handling. Look for duplicates field and for how many + // duplicates per results to return. Default duplicates field is 'site' + // and duplicates per results default is '2'. + String dedupField = request.getParameter("dedupField"); + if (dedupField == null || dedupField.length() == 0) { + dedupField = "site"; + } + int hitsPerDup = 2; + String hitsPerDupString = request.getParameter("hitsPerDup"); + String hitsPerSiteString = request.getParameter("hitsPerSite"); + if (hitsPerDupString != null && hitsPerDupString.length() > 0) { + hitsPerDup = Integer.parseInt(hitsPerDupString); + } else { + // If 'hitsPerSite' present, use that value. + if (hitsPerSiteString != null && hitsPerSiteString.length() > 0) { + hitsPerDup = Integer.parseInt(hitsPerSiteString); + } + } + + // Make up query string for use later drawing the 'rss' logo. + String params = "&hitsPerPage=" + hitsPerPage + + (queryLang == null ? "" : "&lang=" + queryLang) + + (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") + + (dedupField == null ? 
"" : "&dedupField=" + dedupField)); + + Query query = Query.parse(queryString, queryLang, this.conf); + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("query: " + queryString); + NutchBean.LOG.info("lang: " + queryLang); + } + + // execute the query + Hits hits; + try { + hits = bean.search(query, start + hitsPerPage, hitsPerDup, dedupField, sort, reverse); + } catch (IOException e) { + if (NutchBean.LOG.isWarnEnabled()) { + NutchBean.LOG.warn("Search Error", e); + } + hits = new Hits(0,new Hit[0]); + } + + if (NutchBean.LOG.isInfoEnabled()) { + NutchBean.LOG.info("total hits: " + hits.getTotal()); + } + + responseTime = System.nanoTime( ) - responseTime; + + // generate xml results + int end = (int)Math.min(hits.getLength(), start + hitsPerPage); + int length = end-start; + + Hit[] show = hits.getHits(start, end-start); + HitDetails[] details = bean.getDetails(show); + Summary[] summaries = bean.getSummary(details, query); + + String requestUrl = request.getRequestURL().toString(); + String base = requestUrl.substring(0, requestUrl.lastIndexOf('/')); + + + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); + Document doc = factory.newDocumentBuilder().newDocument(); + + Element rss = addNode(doc, doc, "rss"); + addAttribute(doc, rss, "version", "2.0"); + addAttribute(doc, rss, "xmlns:opensearch", + (String)NS_MAP.get("opensearch")); + addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch")); + + Element channel = addNode(doc, rss, "channel"); + + addNode(doc, channel, "title", "Nutch: " + queryString); + addNode(doc, channel, "description", "Nutch search results for query: " + + queryString); + addNode(doc, channel, "link", + base+"/search.jsp" + +"?query="+urlQuery + +"&start="+start + +"&hitsPerDup="+hitsPerDup + +params); + + addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal()); + addNode(doc, channel, "opensearch", "startIndex", ""+start); + addNode(doc, 
channel, "opensearch", "itemsPerPage", ""+hitsPerPage); + + addNode(doc, channel, "nutch", "query", queryString); + addNode(doc, channel, "nutch", "responseTime", Double.toString( ((long) responseTime / 1000 / 1000 ) / 1000.0 ) ); + + // Add a <nutch:urlParams> element containing a list of all the URL parameters. + Element urlParams = doc.createElementNS((String)NS_MAP.get("nutch"), "nutch:urlParams" ); + channel.appendChild( urlParams ); + + for ( Map.Entry<String,String[]> e : ((Map<String,String[]>) request.getParameterMap( )).entrySet( ) ) + { + String key = e.getKey( ); + for ( String value : e.getValue( ) ) + { + Element urlParam = doc.createElementNS((String)NS_MAP.get("nutch"), "nutch:param" ); + addAttribute( doc, urlParam, "name", key ); + addAttribute( doc, urlParam, "value", value ); + urlParams.appendChild(urlParam); + } + } + + // Hmm, we should indicate whether or not the "totalResults" + // number as being exact some other way; perhaps just have a + // <nutch:totalIsExact>true</nutch:totalIsExact> element. + /* + if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show + || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){ + addNode(doc, channel, "nutch", "nextPage", requestUrl + +"?query="+urlQuery + +"&start="+end + +"&hitsPerDup="+hitsPerDup + +params); + } + */ + + // Same here, this seems odd. 
+ /* + if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) { + addNode(doc, channel, "nutch", "showAllHits", requestUrl + +"?query="+urlQuery + +"&hitsPerDup="+0 + +params); + } + */ + + for (int i = 0; i < length; i++) { + Hit hit = show[i]; + HitDetails detail = details[i]; + String title = detail.getValue("title"); + String url = detail.getValue("url"); + String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); + + if (title == null || title.equals("")) { // use url for docs w/o title + title = url; + } + + Element item = addNode(doc, channel, "item"); + + addNode(doc, item, "title", title); + if (summaries[i] != null) { + addNode(doc, item, "description", summaries[i].toString() ); + } + addNode(doc, item, "link", url); + + addNode(doc, item, "nutch", "site", hit.getDedupValue()); + + addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id); + addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id + +"&query="+urlQuery+"&lang="+queryLang); + + // Probably don't need this as the XML processor/front-end can + // easily do this themselves. 
+ if (hit.moreFromDupExcluded()) { + addNode(doc, item, "nutch", "moreFromSite", requestUrl + +"?query=" + +URLEncoder.encode("site:"+hit.getDedupValue() + +" "+queryString, "UTF-8") + +"&hitsPerSite="+0 + +params); + } + + for (int j = 0; j < detail.getLength(); j++) { // add all from detail + String field = detail.getField(j); + if (!SKIP_DETAILS.contains(field)) + addNode(doc, item, "nutch", field, detail.getValue(j)); + } + } + + // dump DOM tree + + DOMSource source = new DOMSource(doc); + TransformerFactory transFactory = TransformerFactory.newInstance(); + Transformer transformer = transFactory.newTransformer(); + transformer.setOutputProperty("indent", "yes"); + StreamResult result = new StreamResult(response.getOutputStream()); + response.setContentType("text/xml"); + transformer.transform(source, result); + + } catch (javax.xml.parsers.ParserConfigurationException e) { + throw new ServletException(e); + } catch (javax.xml.transform.TransformerException e) { + throw new ServletException(e); + } + + } + + private static Element addNode(Document doc, Node parent, String name) { + Element child = doc.createElement(name); + parent.appendChild(child); + return child; + } + + private static void addNode(Document doc, Node parent, + String name, String text) { + if ( text == null ) text = ""; + Element child = doc.createElement(name); + child.appendChild(doc.createTextNode(getLegalXml(text))); + parent.appendChild(child); + } + + private static void addNode(Document doc, Node parent, + String ns, String name, String text) { + if ( text == null ) text = ""; + Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name); + child.appendChild(doc.createTextNode(getLegalXml(text))); + parent.appendChild(child); + } + + private static void addAttribute(Document doc, Element node, + String name, String value) { + Attr attribute = doc.createAttribute(name); + attribute.setValue(getLegalXml(value)); + node.getAttributes().setNamedItem(attribute); + } + + /* + 
* Ensure string is legal xml. + * @param text String to verify. + * @return Passed <code>text</code> or a new string with illegal + * characters removed if any found in <code>text</code>. + * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char + */ + protected static String getLegalXml(final String text) { + if (text == null) { + return null; + } + StringBuffer buffer = null; + for (int i = 0; i < text.length(); i++) { + char c = text.charAt(i); + if (!isLegalXml(c)) { + if (buffer == null) { + // Start up a buffer. Copy characters here from now on + // now we've found at least one bad character in original. + buffer = new StringBuffer(text.length()); + buffer.append(text.substring(0, i)); + } + } else { + if (buffer != null) { + buffer.append(c); + } + } + } + return (buffer != null)? buffer.toString(): text; + } + + private static boolean isLegalXml(final char c) { + return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff) + || (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff); + } + +} Modified: trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl 2008-12-14 21:10:33 UTC (rev 2663) +++ trunk/archive-access/projects/nutchwax/archive/src/web/style/search.xsl 2008-12-15 01:47:48 UTC (rev 2664) @@ -115,42 +115,49 @@ <span class="searchFields"> Search for <input id="query" name="query" type="text" size="40" value="{nutch:query}" /> + + <!-- Create hidden form fields for the rest of the URL parameters --> + <xsl:for-each select="nutch:urlParams/nutch:param[@name!='start' and @name!='query']"> + <xsl:element name="input" namespace="http://www.w3.org/1999/xhtml"> + <xsl:attribute name="type">hidden</xsl:attribute> + <xsl:attribute name="name" ><xsl:value-of select="@name" /></xsl:attribute> + <xsl:attribute name="value"><xsl:value-of select="@value" /></xsl:attribute> + 
</xsl:element> + </xsl:for-each> + <input type="submit" value="Search"/> </span> </form> </div> - <div style="font-size: 8pt; margin:0; padding:0 0 0.5em 0;">Results <xsl:value-of select="opensearch:startIndex + 1" />-<xsl:value-of select="opensearch:startIndex + opensearch:itemsPerPage" /> of about <xsl:value-of select="opensearch:totalResults" /> <span style="margin-left: 1em;"><a href="{nutch:nextPage}">Next</a></span></div> + <div style="font-size: 8pt; margin:0; padding:0 0 0.5em 0;">Results <xsl:value-of select="opensearch:startIndex + 1" />-<xsl:value-of select="opensearch:startIndex + opensearch:itemsPerPage" /> of about <xsl:value-of select="opensearch:totalResults" /> <span style="margin-left: 1em;"></span></div> <!-- Search results --> <ol start="{opensearch:startIndex + 1}"> <xsl:apply-templates select="item" /> </ol> <!-- Generate list of page links --> <center> - <xsl:if test="(floor(opensearch:startIndex div opensearch:itemsPerPage) + 1) != 1"> - <a href="search?query={nutch:query}&start={(floor(opensearch:startIndex div opensearch:itemsPerPage) - 1) * opensearch:itemsPerPage}">«</a><xsl:text> </xsl:text> - </xsl:if> - <xsl:choose> - <xsl:when test="(floor(opensearch:startIndex div opensearch:itemsPerPage) + 1) < 11"> - <xsl:call-template name="pageLinks" > - <xsl:with-param name="begin" select="1" /> - <xsl:with-param name="end" select="21" /> - <xsl:with-param name="current" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1" /> - </xsl:call-template> - </xsl:when> - <xsl:otherwise> - <xsl:call-template name="pageLinks" > - <xsl:with-param name="begin" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1 - 10" /> - <xsl:with-param name="end" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1 + 11" /> - <xsl:with-param name="current" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1" /> - </xsl:call-template> - </xsl:otherwise> - </xsl:choose> - <a href="{nutch:nextPage}">»</a> + 
<xsl:call-template name="pageLinks"> + <xsl:with-param name="labelPrevious" select="'«'" /> + <xsl:with-param name="labelNext" select="'»'" /> + </xsl:call-template> </center> </body> </html> </xsl:template> + +<!-- ====================================================================== + NutchWAX XSLT template/function library. + + The idea is that the above xhtml code is what most NutchWAX users + will modify to tailor to their own look and feel. The stuff + below implements the core logic for generating results lists, + page links, etc. + + Hopefully NutchWAX web developers will be able to easily edit the + above xhtml and css and won't have to change the below. + ====================================================================== --> + <!-- Template to emit a search result as an HTML list item (<li/>). --> <xsl:template match="item"> @@ -176,32 +183,99 @@ <xsl:value-of select="substring(.,1,4)" /><xsl:text>-</xsl:text><xsl:value-of select="substring(.,5,2)" /><xsl:text>-</xsl:text><xsl:value-of select="substring(.,7,2)" /><xsl:text> </xsl:text> </xsl:template> -<!-- Template to generate a list of numbered links to results pages. +<!-- Template to emit a list of numbered page links, *including* + "previous" and "next" links on either end, using the given labels. 
Parameters: + labelPrevious Link text for "previous page" link + labelNext Link text for "next page" link + --> +<xsl:template name="pageLinks"> + <xsl:param name="labelPrevious" /> + <xsl:param name="labelNext" /> + <!-- If we are on any page past the first, emit a "previous" link --> + <xsl:if test="(floor(opensearch:startIndex div opensearch:itemsPerPage) + 1) != 1"> + <xsl:call-template name="pageLink"> + <xsl:with-param name="pageNum" select="floor(opensearch:startIndex div opensearch:itemsPerPage)" /> + <xsl:with-param name="linkText" select="$labelPrevious" /> + </xsl:call-template> + <xsl:text> </xsl:text> + </xsl:if> + <!-- Now, emit numbered page links --> + <xsl:choose> + <xsl:when test="(floor(opensearch:startIndex div opensearch:itemsPerPage) + 1) < 11"> + <xsl:call-template name="numberedPageLinks" > + <xsl:with-param name="begin" select="1" /> + <xsl:with-param name="end" select="21" /> + <xsl:with-param name="current" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1" /> + </xsl:call-template> + </xsl:when> + <xsl:otherwise> + <xsl:call-template name="numberedPageLinks" > + <xsl:with-param name="begin" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1 - 10" /> + <xsl:with-param name="end" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1 + 11" /> + <xsl:with-param name="current" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 1" /> + </xsl:call-template> + </xsl:otherwise> + </xsl:choose> + <!-- Lastly, emit a "next" link. --> + <xsl:text> </xsl:text> + <xsl:call-template name="pageLink"> + <xsl:with-param name="pageNum" select="floor(opensearch:startIndex div opensearch:itemsPerPage) + 2" /> + <xsl:with-param name="linkText" select="$labelNext" /> + </xsl:call-template> +</xsl:template> + +<!-- Template to emit a list of numbered links to results pages. 
+ Parameters: begin starting # inclusive end ending # exclusive current the current page, don't emit a link --> -<xsl:template name="pageLinks"> +<xsl:template name="numberedPageLinks"> <xsl:param name="begin" /> <xsl:param name="end" /> <xsl:param name="current" /> <xsl:if test="$begin < $end"> - <xsl:choose> - <xsl:when test="$begin = $current" > - <xsl:value-of select="$current" /> - </xsl:when> - <xsl:otherwise> - <a href="?query={nutch:query}&start={($begin -1) * opensearch:itemsPerPage}&hitsPerPage={opensearch:itemsPerPage}"><xsl:value-of select="$begin" /></a> - </xsl:otherwise> - </xsl:choose> - <xsl:text> </xsl:text> - <xsl:call-template name="pageLinks"> - <xsl:with-param name="begin" select="$begin + 1" /> - <xsl:with-param name="end" select="$end" /> - <xsl:with-param name="current" select="$current" /> + <xsl:choose> + <xsl:when test="$begin = $current" > + <xsl:value-of select="$current" /> + </xsl:when> + <xsl:otherwise> + <xsl:call-template name="pageLink" > + <xsl:with-param name="pageNum" select="$begin" /> + <xsl:with-param name="linkText" select="$begin" /> </xsl:call-template> + </xsl:otherwise> + </xsl:choose> + <xsl:text> </xsl:text> + <xsl:call-template name="numberedPageLinks"> + <xsl:with-param name="begin" select="$begin + 1" /> + <xsl:with-param name="end" select="$end" /> + <xsl:with-param name="current" select="$current" /> + </xsl:call-template> </xsl:if> </xsl:template> +<!-- Template to emit a single page link. All of the URL parameters + listed in the OpenSearch results are included in the link. 
+ Parameters: + pageNum page number of the link + linkText text of the link + --> +<xsl:template name="pageLink"> + <xsl:param name="pageNum" /> + <xsl:param name="linkText" /> + <xsl:element name="a" namespace="http://www.w3.org/1999/xhtml"> + <xsl:attribute name="href"> + <xsl:text>?</xsl:text> + <xsl:for-each select="nutch:urlParams/nutch:param[@name!='start']"> + <xsl:value-of select="@name" /><xsl:text>=</xsl:text><xsl:value-of select="@value" /> + <xsl:text>&</xsl:text> + </xsl:for-each> + <xsl:text>start=</xsl:text><xsl:value-of select="($pageNum -1) * opensearch:itemsPerPage" /> + </xsl:attribute> + <xsl:value-of select="$linkText" /> + </xsl:element> +</xsl:template> + </xsl:stylesheet> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |