Thread: [Archive-access-cvs] SF.net SVN: archive-access:[2687] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/OpenSearchServlet.java

Brought to you by: binzino, bradtofel, gojomo, ia_igor, and 5 others

archive-access-cvs

[Archive-access-cvs] SF.net SVN: archive-access:[2687] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/OpenSearchServlet.java

From: <bi...@us...> - 2009-03-03 18:20:21

Revision: 2687
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2687&view=rev
Author:   binzino
Date:     2009-03-03 18:20:14 +0000 (Tue, 03 Mar 2009)

Log Message:
-----------
Fixed handling of start and end of search results so that we detect
"paging off the end" and return an empty result set rather than an
exception.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java	2009-02-28 01:26:25 UTC (rev 2686)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java	2009-03-03 18:20:14 UTC (rev 2687)
@@ -162,18 +162,30 @@
 
     responseTime = System.nanoTime( ) - responseTime;
 
-    // generate xml results
-    int end = (int)Math.min(hits.getLength(), start + hitsPerPage);
-    int length = end-start;
+    // The 'end' is usually just the end of the current page
+    // (start+hitsPerPage); but if we are on the last page
+    // of de-duped results, then the end is hits.getLength().
+    int end = Math.min( hits.getLength( ), start + hitsPerPage );
 
-    Hit[] show = hits.getHits(start, end-start);
-    HitDetails[] details = bean.getDetails(show);
-    Summary[] summaries = bean.getSummary(details, query);
+    // The length is usually just (end-start), unless the start
+    // position is past the end of the results -- which is common when
+    // de-duping.  The user could easily jump past the true end of the
+    // de-dup'd results.  If the start is past the end, we use a
+    // length of '0' to produce an empty results page.
+    int length = Math.max( end-start, 0 );
 
+    // Usually, the total results is the total number of non-de-duped
+    // results.  However, if we are on last page of de-duped results,
+    // then we know our de-dup'd total is hits.getLength().
+    long totalResults = hits.getLength( ) < (start+hitsPerPage) ? hits.getLength( ) : hits.getTotal( );
+
+    Hit[]        show      = hits.getHits(start, length );
+    HitDetails[] details   = bean.getDetails(show);
+    Summary[]    summaries = bean.getSummary(details, query);
+
     String requestUrl = request.getRequestURL().toString();
     String base = requestUrl.substring(0, requestUrl.lastIndexOf('/'));
       
-
     try {
       DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
       factory.setNamespaceAware(true);
@@ -197,8 +209,8 @@
               +"&hitsPerDup="+hitsPerDup
               +params);
 
-      addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal());
-      addNode(doc, channel, "opensearch", "startIndex", ""+start);
+      addNode(doc, channel, "opensearch", "totalResults", ""+totalResults);
+      addNode(doc, channel, "opensearch", "startIndex",   ""+start);
       addNode(doc, channel, "opensearch", "itemsPerPage", ""+hitsPerPage);
 
       addNode(doc, channel, "nutch", "query", queryString);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2844] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/OpenSearchServlet.java

From: <bi...@us...> - 2009-10-27 23:17:30

Revision: 2844
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2844&view=rev
Author:   binzino
Date:     2009-10-27 23:17:15 +0000 (Tue, 27 Oct 2009)

Log Message:
-----------
Updated to Nutch 1.0 API.  Also added use of Java generics to avoid type-casts.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java	2009-10-27 23:00:46 UTC (rev 2843)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java	2009-10-27 23:17:15 UTC (rev 2844)
@@ -53,7 +53,7 @@
  */   
 public class OpenSearchServlet extends HttpServlet 
 {
-  private static final Map NS_MAP = new HashMap();
+  private static final Map<String,String> NS_MAP = new HashMap<String,String>();
   private int MAX_HITS_PER_PAGE;
 
   static {
@@ -61,7 +61,7 @@
     NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/");
   }
 
-  private static final Set SKIP_DETAILS = new HashSet();
+  private static final Set<String> SKIP_DETAILS = new HashSet<String>();
   static {
     SKIP_DETAILS.add("url");                   // redundant with RSS link
     SKIP_DETAILS.add("title");                 // redundant with RSS title
@@ -92,9 +92,8 @@
     // get parameters from request
     request.setCharacterEncoding("UTF-8");
     String queryString = request.getParameter("query");
-    if (queryString == null)
-      queryString = "";
-    String urlQuery = URLEncoder.encode(queryString, "UTF-8");
+    if (queryString == null) queryString = "";
+    //String urlQuery = URLEncoder.encode(queryString, "UTF-8");
     
     // the query language
     String queryLang = request.getParameter("lang");
@@ -133,12 +132,6 @@
         }
     }
      
-    // Make up query string for use later drawing the 'rss' logo.
-    String params = "&hitsPerPage=" + hitsPerPage +
-        (queryLang == null ? "" : "&lang=" + queryLang) +
-        (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") +
-        (dedupField == null ? "" : "&dedupField=" + dedupField));
-
     Query query = Query.parse(queryString, queryLang, this.conf);
     if (NutchBean.LOG.isInfoEnabled()) {
       NutchBean.LOG.info("query: " + queryString);
@@ -183,9 +176,6 @@
     HitDetails[] details   = bean.getDetails(show);
     Summary[]    summaries = bean.getSummary(details, query);
 
-    String requestUrl = request.getRequestURL().toString();
-    String base = requestUrl.substring(0, requestUrl.lastIndexOf('/'));
-      
     try {
       DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
       factory.setNamespaceAware(true);
@@ -194,20 +184,14 @@
       Element rss = addNode(doc, doc, "rss");
       addAttribute(doc, rss, "version", "2.0");
       addAttribute(doc, rss, "xmlns:opensearch",
-                   (String)NS_MAP.get("opensearch"));
-      addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch"));
+                   NS_MAP.get("opensearch"));
+      addAttribute(doc, rss, "xmlns:nutch", NS_MAP.get("nutch"));
 
       Element channel = addNode(doc, rss, "channel");
     
       addNode(doc, channel, "title", "Nutch: " + queryString);
-      addNode(doc, channel, "description", "Nutch search results for query: "
-              + queryString);
-      addNode(doc, channel, "link",
-              base+"/search.jsp"
-              +"?query="+urlQuery
-              +"&start="+start
-              +"&hitsPerDup="+hitsPerDup
-              +params);
+      addNode(doc, channel, "description", "Nutch search results for query: " + queryString);
+      addNode(doc, channel, "link", "" );
 
       addNode(doc, channel, "opensearch", "totalResults", ""+totalResults);
       addNode(doc, channel, "opensearch", "startIndex",   ""+start);
@@ -217,7 +201,7 @@
       addNode(doc, channel, "nutch", "responseTime", Double.toString( ((long) responseTime / 1000 / 1000 ) / 1000.0 ) );
 
       // Add a <nutch:urlParams> element containing a list of all the URL parameters.
-      Element urlParams = doc.createElementNS((String)NS_MAP.get("nutch"), "nutch:urlParams" );
+      Element urlParams = doc.createElementNS(NS_MAP.get("nutch"), "nutch:urlParams" );
       channel.appendChild( urlParams );
 
       for ( Map.Entry<String,String[]> e : ((Map<String,String[]>) request.getParameterMap( )).entrySet( ) )
@@ -225,43 +209,19 @@
           String key = e.getKey( );
           for ( String value : e.getValue( ) )
             {
-              Element urlParam = doc.createElementNS((String)NS_MAP.get("nutch"), "nutch:param" );
+              Element urlParam = doc.createElementNS(NS_MAP.get("nutch"), "nutch:param" );
               addAttribute( doc, urlParam, "name",  key   );
               addAttribute( doc, urlParam, "value", value );
               urlParams.appendChild(urlParam);
             }
         }
 
-      // Hmm, we should indicate whether or not the "totalResults"
-      // number as being exact some other way; perhaps just have a
-      // <nutch:totalIsExact>true</nutch:totalIsExact> element.
-      /*
-      if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show
-          || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){
-        addNode(doc, channel, "nutch", "nextPage", requestUrl
-                +"?query="+urlQuery
-                +"&start="+end
-                +"&hitsPerDup="+hitsPerDup
-                +params);
-      }
-      */
-
-      // Same here, this seems odd.
-      /*
-      if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) {
-        addNode(doc, channel, "nutch", "showAllHits", requestUrl
-                +"?query="+urlQuery
-                +"&hitsPerDup="+0
-                +params);
-      }
-      */
-
       for (int i = 0; i < length; i++) {
         Hit hit = show[i];
         HitDetails detail = details[i];
         String title = detail.getValue("title");
         String url = detail.getValue("url");
-        String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo();
+        String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getUniqueKey();
       
         if (title == null || title.equals("")) {   // use url for docs w/o title
           title = url;
@@ -274,24 +234,8 @@
           addNode(doc, item, "description", summaries[i].toString() );
         }
         addNode(doc, item, "link", url);
-
         addNode(doc, item, "nutch", "site", hit.getDedupValue());
 
-        addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id);
-        addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id
-                +"&query="+urlQuery+"&lang="+queryLang);
-
-        // Probably don't need this as the XML processor/front-end can
-        // easily do this themselves.
-        if (hit.moreFromDupExcluded()) {
-          addNode(doc, item, "nutch", "moreFromSite", requestUrl
-                  +"?query="
-                  +URLEncoder.encode("site:"+hit.getDedupValue()
-                                     +" "+queryString, "UTF-8")
-                  +"&hitsPerSite="+0
-                  +params);
-        }
-
         for (int j = 0; j < detail.getLength(); j++) { // add all from detail
           String field = detail.getField(j);
           if (!SKIP_DETAILS.contains(field))
@@ -304,9 +248,9 @@
       DOMSource source = new DOMSource(doc);
       TransformerFactory transFactory = TransformerFactory.newInstance();
       Transformer transformer = transFactory.newTransformer();
-      transformer.setOutputProperty("indent", "yes");
+      transformer.setOutputProperty( javax.xml.transform.OutputKeys.ENCODING, "UTF-8" );
       StreamResult result = new StreamResult(response.getOutputStream());
-      response.setContentType("text/xml");
+      response.setContentType("application/rss+xml");
       transformer.transform(source, result);
 
     } catch (javax.xml.parsers.ParserConfigurationException e) {
@@ -334,7 +278,7 @@
   private static void addNode(Document doc, Node parent,
                               String ns, String name, String text) {
     if ( text == null ) text = "";
-    Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
+    Element child = doc.createElementNS(NS_MAP.get(ns), ns+":"+name);
     child.appendChild(doc.createTextNode(getLegalXml(text)));
     parent.appendChild(child);
   }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2962] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/OpenSearchServlet.java

From: <bi...@us...> - 2010-02-22 05:19:04

Revision: 2962
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2962&view=rev
Author:   binzino
Date:     2010-02-22 05:18:57 +0000 (Mon, 22 Feb 2010)

Log Message:
-----------
Added result score to OpenSearch output.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java	2010-02-22 05:18:00 UTC (rev 2961)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java	2010-02-22 05:18:57 UTC (rev 2962)
@@ -46,6 +46,7 @@
 import org.apache.nutch.searcher.NutchBean;
 import org.apache.nutch.searcher.Query;
 import org.apache.nutch.searcher.Summary;
+import org.apache.hadoop.io.FloatWritable;
 
 /** 
  * Present search results using A9's OpenSearch extensions to RSS,
@@ -183,9 +184,8 @@
  
       Element rss = addNode(doc, doc, "rss");
       addAttribute(doc, rss, "version", "2.0");
-      addAttribute(doc, rss, "xmlns:opensearch",
-                   NS_MAP.get("opensearch"));
-      addAttribute(doc, rss, "xmlns:nutch", NS_MAP.get("nutch"));
+      addAttribute(doc, rss, "xmlns:opensearch", NS_MAP.get("opensearch"));
+      addAttribute(doc, rss, "xmlns:nutch",      NS_MAP.get("nutch"));
 
       Element channel = addNode(doc, rss, "channel");
     
@@ -201,7 +201,7 @@
       addNode(doc, channel, "nutch", "responseTime", Double.toString( ((long) responseTime / 1000 / 1000 ) / 1000.0 ) );
 
       // Add a <nutch:urlParams> element containing a list of all the URL parameters.
-      Element urlParams = doc.createElementNS(NS_MAP.get("nutch"), "nutch:urlParams" );
+      Element urlParams = doc.createElementNS( NS_MAP.get("nutch"), "nutch:urlParams" );
       channel.appendChild( urlParams );
 
       for ( Map.Entry<String,String[]> e : ((Map<String,String[]>) request.getParameterMap( )).entrySet( ) )
@@ -219,9 +219,9 @@
       for (int i = 0; i < length; i++) {
         Hit hit = show[i];
         HitDetails detail = details[i];
+        String score = Float.toString( ((FloatWritable)hit.getSortValue( )).get() );
         String title = detail.getValue("title");
-        String url = detail.getValue("url");
-        String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getUniqueKey();
+        String url   = detail.getValue("url");
       
         if (title == null || title.equals("")) {   // use url for docs w/o title
           title = url;
@@ -229,6 +229,7 @@
         
         Element item = addNode(doc, channel, "item");
 
+        addNode(doc, item, "nutch", "score", score );
         addNode(doc, item, "title", title);
         if (summaries[i] != null) {
           addNode(doc, item, "description", summaries[i].toString() );


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access:[2963] trunk/archive-access/projects/nutchwax/ archive/src/java/org/archive/nutchwax/OpenSearchServlet.java

From: <bi...@us...> - 2010-02-22 22:19:48

Revision: 2963
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2963&view=rev
Author:   binzino
Date:     2010-02-22 22:19:42 +0000 (Mon, 22 Feb 2010)

Log Message:
-----------
Removed extra 'nutch:' prefix from urlParams and param elements in output.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java	2010-02-22 05:18:57 UTC (rev 2962)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/OpenSearchServlet.java	2010-02-22 22:19:42 UTC (rev 2963)
@@ -201,7 +201,7 @@
       addNode(doc, channel, "nutch", "responseTime", Double.toString( ((long) responseTime / 1000 / 1000 ) / 1000.0 ) );
 
       // Add a <nutch:urlParams> element containing a list of all the URL parameters.
-      Element urlParams = doc.createElementNS( NS_MAP.get("nutch"), "nutch:urlParams" );
+      Element urlParams = doc.createElementNS( NS_MAP.get("nutch"), "urlParams" );
       channel.appendChild( urlParams );
 
       for ( Map.Entry<String,String[]> e : ((Map<String,String[]>) request.getParameterMap( )).entrySet( ) )
@@ -209,7 +209,7 @@
           String key = e.getKey( );
           for ( String value : e.getValue( ) )
             {
-              Element urlParam = doc.createElementNS(NS_MAP.get("nutch"), "nutch:param" );
+              Element urlParam = doc.createElementNS(NS_MAP.get("nutch"), "param" );
               addAttribute( doc, urlParam, "name",  key   );
               addAttribute( doc, urlParam, "value", value );
               urlParams.appendChild(urlParam);


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.