[Archive-access-cvs] archive-access/projects/nutch/src/java/org/archive/access/nutch NutchwaxOpenSearchServlet.java,1.3,1.4

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16586/src/java/org/archive/access/nutch

Modified Files:
	NutchwaxOpenSearchServlet.java 
Log Message:
Fix for '[ 1312212 ] bad xml chars in search results'
* src/java/org/archive/access/nutch/NutchwaxOpenSearchServlet.java
    I didn't want to bring a complete copy of OpenSearchServlet into nutchwax
    but have no choice if I want to fix the bad xml bug.  Have submitted a patch
    to nutch.  If it gets applied, I'll remove this copy of the whole servlet.
    In the meantime, the code below runs all text through a filter that looks
    for, and strips, disallowed xml characters.

Index: NutchwaxOpenSearchServlet.java
===================================================================
RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/NutchwaxOpenSearchServlet.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** NutchwaxOpenSearchServlet.java	6 Oct 2005 17:35:02 -0000	1.3
--- NutchwaxOpenSearchServlet.java	13 Oct 2005 15:53:36 -0000	1.4
***************
*** 1,274 ****
! /* NutchwaxOpenSearchServlet.java
   *
!  * $Id$
   *
!  * Created Jul 26, 2005
   *
!  * Copyright (C) 2005 Internet Archive.
!  * 
!  * This file is part of the archive-access tools project
!  * (http://sourceforge.net/projects/archive-access).
!  * 
!  * The archive-access tools are free software; you can redistribute them and/or
!  * modify them under the terms of the GNU Lesser Public License as published by
!  * the Free Software Foundation; either version 2.1 of the License, or any
!  * later version.
!  * 
!  * The archive-access tools are distributed in the hope that they will be
!  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
!  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
!  * Public License for more details.
!  * 
!  * You should have received a copy of the GNU Lesser Public License along with
!  * the archive-access tools; if not, write to the Free Software Foundation,
!  * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
   */
  package org.archive.access.nutch;

! import java.io.BufferedReader;
  import java.io.IOException;
! import java.io.UnsupportedEncodingException;
! import java.security.Principal;
! import java.util.Enumeration;
! import java.util.Locale;
  import java.util.Map;

- import javax.servlet.RequestDispatcher;
  import javax.servlet.ServletException;
! import javax.servlet.ServletInputStream;
! import javax.servlet.http.Cookie;
  import javax.servlet.http.HttpServletRequest;
  import javax.servlet.http.HttpServletResponse;
- import javax.servlet.http.HttpSession;
- 
- import org.apache.nutch.searcher.OpenSearchServlet;
- 
- public class NutchwaxOpenSearchServlet extends OpenSearchServlet {
-     public void doGet(final HttpServletRequest req,
-             final HttpServletResponse res)
-     throws ServletException, IOException {
-         // Make a delegating method that preprocesses the query string
-         // converting any exacturl values so they'll pass the NutchAnalysis.
-         HttpServletRequest delegatingReq = new HttpServletRequest() {
-             public String getParameter(String parameter) {
-                 String q = req.getParameter(parameter);
-                 return (parameter != null && parameter.equals("query"))?
-                     NutchwaxQuery.encodeExacturl(q): q;
-             }
-             
-             public String getAuthType() {
-                 return req.getAuthType();
-             }
- 
-             public Cookie[] getCookies() {
-                 return req.getCookies();
-             }
- 
-             public long getDateHeader(String arg0) {
-                 return req.getDateHeader(arg0);
-             }
- 
-             public String getHeader(String arg0) {
-                 return req.getHeader(arg0);
-             }
- 
-             public Enumeration getHeaders(String arg0) {
-                 return req.getHeaders(arg0);
-             }
- 
-             public Enumeration getHeaderNames() {
-                 return req.getHeaderNames();
-             }
- 
-             public int getIntHeader(String arg0) {
-                 return req.getIntHeader(arg0);
-             }
- 
-             public String getMethod() {
-                 return req.getMethod();
-             }
- 
-             public String getPathInfo() {
-                 return req.getPathInfo();
-             }

!             public String getPathTranslated() {
!                 return req.getPathTranslated();
!             }
! 
!             public String getContextPath() {
!                 return req.getContextPath();
!             }
! 
!             public String getQueryString() {
!                 return req.getQueryString();
!             }

-             public String getRemoteUser() {
-                 return req.getRemoteUser();
-             }

!             public boolean isUserInRole(String arg0) {
!                 return req.isUserInRole(arg0);
!             }

!             public Principal getUserPrincipal() {
!                 return req.getUserPrincipal();
!             }

!             public String getRequestedSessionId() {
!                 return req.getRequestedSessionId();
!             }

!             public String getRequestURI() {
!                 return req.getRequestURI();
!             }

!             public StringBuffer getRequestURL() {
!                 return req.getRequestURL();
!             }

!             public String getServletPath() {
!                 return req.getServletPath();
!             }

!             public HttpSession getSession(boolean arg0) {
!                 return req.getSession(arg0);
!             }

!             public HttpSession getSession() {
!                 return req.getSession();
!             }

!             public boolean isRequestedSessionIdValid() {
!                 return req.isRequestedSessionIdValid();
!             }

!             public boolean isRequestedSessionIdFromCookie() {
!                 return req.isRequestedSessionIdFromCookie();
!             }

!             public boolean isRequestedSessionIdFromURL() {
!                 return req.isRequestedSessionIdFromURL();
!             }

!             public boolean isRequestedSessionIdFromUrl() {
!                 return req.isRequestedSessionIdFromUrl();
!             }

!             public Object getAttribute(String arg0) {
!                 return req.getAttribute(arg0);
!             }

!             public Enumeration getAttributeNames() {
!                 return req.getAttributeNames();
!             }

!             public String getCharacterEncoding() {
!                 return req.getCharacterEncoding();
!             }

!             public void setCharacterEncoding(String arg0)
!             throws UnsupportedEncodingException {
!                 req.setCharacterEncoding(arg0);
!             }

!             public int getContentLength() {
!                 return req.getContentLength();
!             }

!             public String getContentType() {
!                 return req.getContentType();
!             }

!             public ServletInputStream getInputStream() throws IOException {
!                 return req.getInputStream();
!             }

!             public Enumeration getParameterNames() {
!                 return req.getParameterNames();
!             }

!             public String[] getParameterValues(String arg0) {
!                 return req.getParameterValues(arg0);
!             }

!             public Map getParameterMap() {
!                 return req.getParameterMap();
!             }

!             public String getProtocol() {
!                 return req.getProtocol();
!             }

!             public String getScheme() {
!                 return req.getScheme();
!             }

!             public String getServerName() {
!                 return req.getServerName();
!             }

!             public int getServerPort() {
!                 return req.getServerPort();
!             }

!             public BufferedReader getReader() throws IOException {
!                 return req.getReader();
!             }

!             public String getRemoteAddr() {
!                 return req.getRemoteAddr();
!             }

!             public String getRemoteHost() {
!                 return req.getRemoteHost();
!             }

!             public void setAttribute(String arg0, Object arg1) {
!                 req.setAttribute(arg0, arg1);
!             }

!             public void removeAttribute(String arg0) {
!                 req.removeAttribute(arg0);
!             }

!             public Locale getLocale() {
!                 return req.getLocale();
!             }

!             public Enumeration getLocales() {
!                 return req.getLocales();
!             }

!             public boolean isSecure() {
!                 return req.isSecure();
!             }

!             public RequestDispatcher getRequestDispatcher(String arg0) {
!                 return req.getRequestDispatcher(arg0);
!             }

!             public String getRealPath(String arg0) {
!                 return req.getRealPath(arg0);
!             }

!             public int getRemotePort() {
!                 return req.getRemotePort();
!             }

!             public String getLocalName() {
!                 return req.getLocalName();
!             }

!             public String getLocalAddr() {
!                 return req.getLocalAddr();
!             }

!             public int getLocalPort() {
!                 return req.getLocalPort();
!             }
!         };
!         super.doGet(delegatingReq, res);
!     }
  }
--- 1,330 ----
! /**
!  * Copyright 2005 The Apache Software Foundation
   *
!  * Licensed under the Apache License, Version 2.0 (the "License");
!  * you may not use this file except in compliance with the License.
!  * You may obtain a copy of the License at
   *
!  *     http://www.apache.org/licenses/LICENSE-2.0
   *
!  * Unless required by applicable law or agreed to in writing, software
!  * distributed under the License is distributed on an "AS IS" BASIS,
!  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
!  * See the License for the specific language governing permissions and
!  * limitations under the License.
   */
+ 
+ // Changed package name by St.Ack
  package org.archive.access.nutch;

! // Added by St.Ack.
! import org.apache.nutch.searcher.NutchBean;
! import org.apache.nutch.searcher.Query;
! import org.apache.nutch.searcher.HitDetails;
! import org.apache.nutch.searcher.Hit;
! import org.apache.nutch.searcher.Hits;
! 
  import java.io.IOException;
! import java.net.URLEncoder;
! import java.util.logging.Level;
  import java.util.Map;
+ import java.util.HashMap;
+ import java.util.Set;
+ import java.util.HashSet;

  import javax.servlet.ServletException;
! import javax.servlet.ServletConfig;
! import javax.servlet.http.HttpServlet;
  import javax.servlet.http.HttpServletRequest;
  import javax.servlet.http.HttpServletResponse;

! import javax.xml.parsers.*;
! import org.w3c.dom.*;
! import javax.xml.transform.TransformerFactory;
! import javax.xml.transform.Transformer;
! import javax.xml.transform.dom.DOMSource;
! import javax.xml.transform.stream.StreamResult;

! /** Present search results using A9's OpenSearch extensions to RSS, plus a few
!  * Nutch-specific extensions.
!  * 
!  * This is the nutch version with filtering for bad xml characters and
!  * encoding of exacturl.  St.Ack 10/12/2005.
!  */   
! public class NutchwaxOpenSearchServlet extends HttpServlet {
!   private static final Map NS_MAP = new HashMap();

!   static {
!     NS_MAP.put("opensearch", "http://a9.com/-/spec/opensearchrss/1.0/");
!     NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/");
!   }

!   private static final Set SKIP_DETAILS = new HashSet();
!   static {
!     SKIP_DETAILS.add("url");                   // redundant with RSS link
!     SKIP_DETAILS.add("title");                 // redundant with RSS title
!   }

!   private NutchBean bean;

!   public void init(ServletConfig config) throws ServletException {
!     try {
!       bean = NutchBean.get(config.getServletContext());
!     } catch (IOException e) {
!       throw new ServletException(e);
!     }
!   }

!   public void doGet(HttpServletRequest request, HttpServletResponse response)
!     throws ServletException, IOException {

!     NutchBean.LOG.info("query request from " + request.getRemoteAddr());

!     // get parameters from request
!     request.setCharacterEncoding("UTF-8");
!     String queryString = request.getParameter("query");
!     if (queryString == null)
!       queryString = "";

!     // Do exacturl encoding. Added by St.Ack
!     queryString = NutchwaxQuery.encodeExacturl(queryString);
!     String urlQuery = URLEncoder.encode(queryString, "UTF-8");

!     int start = 0;                                // first hit to display
!     String startString = request.getParameter("start");
!     if (startString != null)
!       start = Integer.parseInt(startString);
!     
!     int hitsPerPage = 10;                         // number of hits to display
!     String hitsString = request.getParameter("hitsPerPage");
!     if (hitsString != null)
!       hitsPerPage = Integer.parseInt(hitsString);

!     String sort = request.getParameter("sort");
!     boolean reverse =
!       sort!=null && "true".equals(request.getParameter("reverse"));

!     // De-Duplicate handling.  Look for duplicates field and for how many
!     // duplicates per results to return. Default duplicates field is 'site'
!     // and duplicates per results default is '2'.
!     String dedupField = request.getParameter("dedupField");
!     if (dedupField == null || dedupField.length() == 0) {
!         dedupField = "site";
!     }
!     int hitsPerDup = 2;
!     String hitsPerDupString = request.getParameter("hitsPerDup");
!     if (hitsPerDupString != null && hitsPerDupString.length() > 0) {
!         hitsPerDup = Integer.parseInt(hitsPerDupString);
!     } else {
!         // If 'hitsPerSite' present, use that value.
!         String hitsPerSiteString = request.getParameter("hitsPerSite");
!         if (hitsPerSiteString != null && hitsPerSiteString.length() > 0) {
!             hitsPerDup = Integer.parseInt(hitsPerSiteString);
!         }
!     }
!      
!     // Make up query string for use later drawing the 'rss' logo.
!     String params = "&hitsPerPage=" + hitsPerPage +
!         (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") +
!         (dedupField == null ? "" : "&dedupField=" + dedupField));

!     Query query = Query.parse(queryString);
!     NutchBean.LOG.info("query: " + queryString);

!     // execute the query
!     Hits hits;
!     try {
!       hits = bean.search(query, start + hitsPerPage, hitsPerDup, dedupField,
!           sort, reverse);
!     } catch (IOException e) {
!       NutchBean.LOG.log(Level.WARNING, "Search Error", e);
!       hits = new Hits(0,new Hit[0]);	
!     }

!     NutchBean.LOG.info("total hits: " + hits.getTotal());

!     // generate xml results
!     int end = (int)Math.min(hits.getLength(), start + hitsPerPage);
!     int length = end-start;

!     Hit[] show = hits.getHits(start, end-start);
!     HitDetails[] details = bean.getDetails(show);
!     String[] summaries = bean.getSummary(details, query);

!     String requestUrl = request.getRequestURL().toString();
!     String base = requestUrl.substring(0, requestUrl.lastIndexOf('/'));
!       

!     try {
!       DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
!       factory.setNamespaceAware(true);
!       Document doc = factory.newDocumentBuilder().newDocument();
!  
!       Element rss = addNode(doc, doc, "rss");
!       addAttribute(doc, rss, "version", "2.0");
!       addAttribute(doc, rss, "xmlns:opensearch",
!                    (String)NS_MAP.get("opensearch"));
!       addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch"));

!       Element channel = addNode(doc, rss, "channel");
!     
!       addNode(doc, channel, "title", "Nutch: " + queryString);
!       addNode(doc, channel, "description", "Nutch search results for query: "
!               + queryString);
!       addNode(doc, channel, "link",
!               base+"/search.jsp"
!               +"?query="+urlQuery
!               +"&start="+start
!               +"&hitsPerDup="+hitsPerDup
!               +params);

!       addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal());
!       addNode(doc, channel, "opensearch", "startIndex", ""+start);
!       addNode(doc, channel, "opensearch", "itemsPerPage", ""+hitsPerPage);

!       addNode(doc, channel, "nutch", "query", queryString);
!     

!       if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show
!           || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){
!         addNode(doc, channel, "nutch", "nextPage", requestUrl
!                 +"?query="+urlQuery
!                 +"&start="+end
!                 +"&hitsPerDup="+hitsPerDup
!                 +params);
!       }

!       if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) {
!         addNode(doc, channel, "nutch", "showAllHits", requestUrl
!                 +"?query="+urlQuery
!                 +"&hitsPerDup="+0
!                 +params);
!       }

!       for (int i = 0; i < length; i++) {
!         Hit hit = show[i];
!         HitDetails detail = details[i];
!         String title = detail.getValue("title");
!         String url = detail.getValue("url");
!         String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo();
!       
!         if (title == null || title.equals(""))    // use url for docs w/o title
!           title = url;

!         Element item = addNode(doc, channel, "item");

!         addNode(doc, item, "title", title);
!         addNode(doc, item, "description", summaries[i]);
!         addNode(doc, item, "link", url);

!         addNode(doc, item, "nutch", "site", hit.getDedupValue());

!         addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id);
!         addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id
!                 +"&query="+urlQuery);

!         if (hit.moreFromDupExcluded()) {
!           addNode(doc, item, "nutch", "moreFromSite", requestUrl
!                   +"?query="
!                   +URLEncoder.encode("site:"+hit.getDedupValue()
!                                      +" "+queryString, "UTF-8")
!                   +"&hitsPerSite="+0
!                   +params);
!         }

!         for (int j = 0; j < detail.getLength(); j++) { // add all from detail
!           String field = detail.getField(j);
!           if (!SKIP_DETAILS.contains(field))
!             addNode(doc, item, "nutch", field, detail.getValue(j));
!         }
!       }

!       // dump DOM tree

!       DOMSource source = new DOMSource(doc);
!       TransformerFactory transFactory = TransformerFactory.newInstance();
!       Transformer transformer = transFactory.newTransformer();
!       transformer.setOutputProperty("indent", "yes");
!       StreamResult result = new StreamResult(response.getOutputStream());
!       response.setContentType("text/xml");
!       transformer.transform(source, result);

!     } catch (javax.xml.parsers.ParserConfigurationException e) {
!       throw new ServletException(e);
!     } catch (javax.xml.transform.TransformerException e) {
!       throw new ServletException(e);
!     }
!       
!   }

!   private static Element addNode(Document doc, Node parent, String name) {
!     Element child = doc.createElement(name);
!     parent.appendChild(child);
!     return child;
!   }

!   private static void addNode(Document doc, Node parent,
!                               String name, String text) {
!     Element child = doc.createElement(name);
!     child.appendChild(doc.createTextNode(getLegalXml(text)));
!     parent.appendChild(child);
!   }

!   private static void addNode(Document doc, Node parent,
!                               String ns, String name, String text) {
!     Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name);
!     child.appendChild(doc.createTextNode(getLegalXml(text)));
!     parent.appendChild(child);
!   }

!   private static void addAttribute(Document doc, Element node,
!                                    String name, String value) {
!     Attr attribute = doc.createAttribute(name);
!     attribute.setValue(getLegalXml(value));
!     node.getAttributes().setNamedItem(attribute);
!   }

!   /*
!    * Ensure string is legal xml.
!    * First look to see if string has illegal characters.  If it doesn't,
!    * just return it.  Otherwise, create new string with illegal characters removed.
!    * @param text String to verify.
!    * @return Passed <code>text</code> or a new string with illegal
!    * characters removed if any found in <code>text</code>.
!    * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char
!    */
!   private static String getLegalXml(final String text) {
!       if (text == null) {
!           return null;
!       }
!       boolean allLegal = true;
!       for (int i = 0; i < text.length(); i++) {
!         if (!isLegalXml(text.charAt(i))) {
!             allLegal = false; 
!             break;
!         }
!       }
!       return allLegal? text: createLegalXml(text);
!   }

!   private static String createLegalXml(final String text) {
!       if (text == null) {
!           return null;
!       }
!       StringBuffer buffer = new StringBuffer(text.length());
!       for (int i = 0; i < text.length(); i++) {
!         char c = text.charAt(i);
!         if (isLegalXml(c)) {
!           buffer.append(c);
!          }
!       }
!       return buffer.toString();
!   }
!   
!   private static boolean isLegalXml(final char c) {
!     return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff)
!         || (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff);
!   }
  }
+ 

[Archive-access-cvs] archive-access/projects/nutch/src/java/org/archive/access/nutch NutchwaxOpenSea

[Archive-access-cvs] archive-access/projects/nutch/src/java/org/archive/access/nutch NutchwaxOpenSearchServlet.java,1.3,1.4