From: Michael S. <sta...@us...> - 2005-10-13 15:53:49
|
Update of /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16586/src/java/org/archive/access/nutch Modified Files: NutchwaxOpenSearchServlet.java Log Message: Fix for '[ 1312212 ] bad xml chars in search results' * src/java/org/archive/access/nutch/NutchwaxOpenSearchServlet.java I didn't want to bring into nutchwax a complete copy of OpenSearchServlet but have no choice if I want to fix the bad xml bug. Have submitted a patch to nutch. If it gets applied I'll remove the inclusion of the total servlet. In the meantime, the below runs all text through a filter that looks for disallowed xml characters. Index: NutchwaxOpenSearchServlet.java =================================================================== RCS file: /cvsroot/archive-access/archive-access/projects/nutch/src/java/org/archive/access/nutch/NutchwaxOpenSearchServlet.java,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** NutchwaxOpenSearchServlet.java 6 Oct 2005 17:35:02 -0000 1.3 --- NutchwaxOpenSearchServlet.java 13 Oct 2005 15:53:36 -0000 1.4 *************** *** 1,274 **** ! /* NutchwaxOpenSearchServlet.java * ! * $Id$ * ! * Created Jul 26, 2005 * ! * Copyright (C) 2005 Internet Archive. ! * ! * This file is part of the archive-access tools project ! * (http://sourceforge.net/projects/archive-access). ! * ! * The archive-access tools are free software; you can redistribute them and/or ! * modify them under the terms of the GNU Lesser Public License as published by ! * the Free Software Foundation; either version 2.1 of the License, or any ! * later version. ! * ! * The archive-access tools are distributed in the hope that they will be ! * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of ! * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser ! * Public License for more details. ! * ! 
* You should have received a copy of the GNU Lesser Public License along with ! * the archive-access tools; if not, write to the Free Software Foundation, ! * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.access.nutch; ! import java.io.BufferedReader; import java.io.IOException; ! import java.io.UnsupportedEncodingException; ! import java.security.Principal; ! import java.util.Enumeration; ! import java.util.Locale; import java.util.Map; - import javax.servlet.RequestDispatcher; import javax.servlet.ServletException; ! import javax.servlet.ServletInputStream; ! import javax.servlet.http.Cookie; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; - import javax.servlet.http.HttpSession; - - import org.apache.nutch.searcher.OpenSearchServlet; - - public class NutchwaxOpenSearchServlet extends OpenSearchServlet { - public void doGet(final HttpServletRequest req, - final HttpServletResponse res) - throws ServletException, IOException { - // Make a delegating method that preprocesses the query string - // converting any exacturl values so they'll pass the NutchAnalysis. - HttpServletRequest delegatingReq = new HttpServletRequest() { - public String getParameter(String parameter) { - String q = req.getParameter(parameter); - return (parameter != null && parameter.equals("query"))? 
- NutchwaxQuery.encodeExacturl(q): q; - } - - public String getAuthType() { - return req.getAuthType(); - } - - public Cookie[] getCookies() { - return req.getCookies(); - } - - public long getDateHeader(String arg0) { - return req.getDateHeader(arg0); - } - - public String getHeader(String arg0) { - return req.getHeader(arg0); - } - - public Enumeration getHeaders(String arg0) { - return req.getHeaders(arg0); - } - - public Enumeration getHeaderNames() { - return req.getHeaderNames(); - } - - public int getIntHeader(String arg0) { - return req.getIntHeader(arg0); - } - - public String getMethod() { - return req.getMethod(); - } - - public String getPathInfo() { - return req.getPathInfo(); - } ! public String getPathTranslated() { ! return req.getPathTranslated(); ! } ! ! public String getContextPath() { ! return req.getContextPath(); ! } ! ! public String getQueryString() { ! return req.getQueryString(); ! } - public String getRemoteUser() { - return req.getRemoteUser(); - } ! public boolean isUserInRole(String arg0) { ! return req.isUserInRole(arg0); ! } ! public Principal getUserPrincipal() { ! return req.getUserPrincipal(); ! } ! public String getRequestedSessionId() { ! return req.getRequestedSessionId(); ! } ! public String getRequestURI() { ! return req.getRequestURI(); ! } ! public StringBuffer getRequestURL() { ! return req.getRequestURL(); ! } ! public String getServletPath() { ! return req.getServletPath(); ! } ! public HttpSession getSession(boolean arg0) { ! return req.getSession(arg0); ! } ! public HttpSession getSession() { ! return req.getSession(); ! } ! public boolean isRequestedSessionIdValid() { ! return req.isRequestedSessionIdValid(); ! } ! public boolean isRequestedSessionIdFromCookie() { ! return req.isRequestedSessionIdFromCookie(); ! } ! public boolean isRequestedSessionIdFromURL() { ! return req.isRequestedSessionIdFromURL(); ! } ! public boolean isRequestedSessionIdFromUrl() { ! return req.isRequestedSessionIdFromUrl(); ! } ! 
public Object getAttribute(String arg0) { ! return req.getAttribute(arg0); ! } ! public Enumeration getAttributeNames() { ! return req.getAttributeNames(); ! } ! public String getCharacterEncoding() { ! return req.getCharacterEncoding(); ! } ! public void setCharacterEncoding(String arg0) ! throws UnsupportedEncodingException { ! req.setCharacterEncoding(arg0); ! } ! public int getContentLength() { ! return req.getContentLength(); ! } ! public String getContentType() { ! return req.getContentType(); ! } ! public ServletInputStream getInputStream() throws IOException { ! return req.getInputStream(); ! } ! public Enumeration getParameterNames() { ! return req.getParameterNames(); ! } ! public String[] getParameterValues(String arg0) { ! return req.getParameterValues(arg0); ! } ! public Map getParameterMap() { ! return req.getParameterMap(); ! } ! public String getProtocol() { ! return req.getProtocol(); ! } ! public String getScheme() { ! return req.getScheme(); ! } ! public String getServerName() { ! return req.getServerName(); ! } ! public int getServerPort() { ! return req.getServerPort(); ! } ! public BufferedReader getReader() throws IOException { ! return req.getReader(); ! } ! public String getRemoteAddr() { ! return req.getRemoteAddr(); ! } ! public String getRemoteHost() { ! return req.getRemoteHost(); ! } ! public void setAttribute(String arg0, Object arg1) { ! req.setAttribute(arg0, arg1); ! } ! public void removeAttribute(String arg0) { ! req.removeAttribute(arg0); ! } ! public Locale getLocale() { ! return req.getLocale(); ! } ! public Enumeration getLocales() { ! return req.getLocales(); ! } ! public boolean isSecure() { ! return req.isSecure(); ! } ! public RequestDispatcher getRequestDispatcher(String arg0) { ! return req.getRequestDispatcher(arg0); ! } ! public String getRealPath(String arg0) { ! return req.getRealPath(arg0); ! } ! public int getRemotePort() { ! return req.getRemotePort(); ! } ! public String getLocalName() { ! 
return req.getLocalName(); ! } ! public String getLocalAddr() { ! return req.getLocalAddr(); ! } ! public int getLocalPort() { ! return req.getLocalPort(); ! } ! }; ! super.doGet(delegatingReq, res); ! } } --- 1,330 ---- ! /** ! * Copyright 2005 The Apache Software Foundation * ! * Licensed under the Apache License, Version 2.0 (the "License"); ! * you may not use this file except in compliance with the License. ! * You may obtain a copy of the License at * ! * http://www.apache.org/licenses/LICENSE-2.0 * ! * Unless required by applicable law or agreed to in writing, software ! * distributed under the License is distributed on an "AS IS" BASIS, ! * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ! * See the License for the specific language governing permissions and ! * limitations under the License. */ + + // Changed package name by St.Ack package org.archive.access.nutch; ! // Added by St.Ack. ! import org.apache.nutch.searcher.NutchBean; ! import org.apache.nutch.searcher.Query; ! import org.apache.nutch.searcher.HitDetails; ! import org.apache.nutch.searcher.Hit; ! import org.apache.nutch.searcher.Hits; ! import java.io.IOException; ! import java.net.URLEncoder; ! import java.util.logging.Level; import java.util.Map; + import java.util.HashMap; + import java.util.Set; + import java.util.HashSet; import javax.servlet.ServletException; ! import javax.servlet.ServletConfig; ! import javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; ! import javax.xml.parsers.*; ! import org.w3c.dom.*; ! import javax.xml.transform.TransformerFactory; ! import javax.xml.transform.Transformer; ! import javax.xml.transform.dom.DOMSource; ! import javax.xml.transform.stream.StreamResult; ! /** Present search results using A9's OpenSearch extensions to RSS, plus a few ! * Nutch-specific extensions. ! * ! * This is the nutch version with filtering for bad xml characters and ! 
* encoding of exacturl. St.Ack 10/12/2005. ! */ ! public class NutchwaxOpenSearchServlet extends HttpServlet { ! private static final Map NS_MAP = new HashMap(); ! static { ! NS_MAP.put("opensearch", "http://a9.com/-/spec/opensearchrss/1.0/"); ! NS_MAP.put("nutch", "http://www.nutch.org/opensearchrss/1.0/"); ! } ! private static final Set SKIP_DETAILS = new HashSet(); ! static { ! SKIP_DETAILS.add("url"); // redundant with RSS link ! SKIP_DETAILS.add("title"); // redundant with RSS title ! } ! private NutchBean bean; ! public void init(ServletConfig config) throws ServletException { ! try { ! bean = NutchBean.get(config.getServletContext()); ! } catch (IOException e) { ! throw new ServletException(e); ! } ! } ! public void doGet(HttpServletRequest request, HttpServletResponse response) ! throws ServletException, IOException { ! NutchBean.LOG.info("query request from " + request.getRemoteAddr()); ! // get parameters from request ! request.setCharacterEncoding("UTF-8"); ! String queryString = request.getParameter("query"); ! if (queryString == null) ! queryString = ""; ! // Do exacturl encoding. Added by St.Ack ! queryString = NutchwaxQuery.encodeExacturl(queryString); ! String urlQuery = URLEncoder.encode(queryString, "UTF-8"); ! int start = 0; // first hit to display ! String startString = request.getParameter("start"); ! if (startString != null) ! start = Integer.parseInt(startString); ! ! int hitsPerPage = 10; // number of hits to display ! String hitsString = request.getParameter("hitsPerPage"); ! if (hitsString != null) ! hitsPerPage = Integer.parseInt(hitsString); ! String sort = request.getParameter("sort"); ! boolean reverse = ! sort!=null && "true".equals(request.getParameter("reverse")); ! // De-Duplicate handling. Look for duplicates field and for how many ! // duplicates per results to return. Default duplicates field is 'site' ! // and duplicates per results default is '2'. ! String dedupField = request.getParameter("dedupField"); ! 
if (dedupField == null || dedupField.length() == 0) { ! dedupField = "site"; ! } ! int hitsPerDup = 2; ! String hitsPerDupString = request.getParameter("hitsPerDup"); ! if (hitsPerDupString != null && hitsPerDupString.length() > 0) { ! hitsPerDup = Integer.parseInt(hitsPerDupString); ! } else { ! // If 'hitsPerSite' present, use that value. ! String hitsPerSiteString = request.getParameter("hitsPerSite"); ! if (hitsPerSiteString != null && hitsPerSiteString.length() > 0) { ! hitsPerDup = Integer.parseInt(hitsPerSiteString); ! } ! } ! ! // Make up query string for use later drawing the 'rss' logo. ! String params = "&hitsPerPage=" + hitsPerPage + ! (sort == null ? "" : "&sort=" + sort + (reverse? "&reverse=true": "") + ! (dedupField == null ? "" : "&dedupField=" + dedupField)); ! Query query = Query.parse(queryString); ! NutchBean.LOG.info("query: " + queryString); ! // execute the query ! Hits hits; ! try { ! hits = bean.search(query, start + hitsPerPage, hitsPerDup, dedupField, ! sort, reverse); ! } catch (IOException e) { ! NutchBean.LOG.log(Level.WARNING, "Search Error", e); ! hits = new Hits(0,new Hit[0]); ! } ! NutchBean.LOG.info("total hits: " + hits.getTotal()); ! // generate xml results ! int end = (int)Math.min(hits.getLength(), start + hitsPerPage); ! int length = end-start; ! Hit[] show = hits.getHits(start, end-start); ! HitDetails[] details = bean.getDetails(show); ! String[] summaries = bean.getSummary(details, query); ! String requestUrl = request.getRequestURL().toString(); ! String base = requestUrl.substring(0, requestUrl.lastIndexOf('/')); ! ! try { ! DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); ! factory.setNamespaceAware(true); ! Document doc = factory.newDocumentBuilder().newDocument(); ! ! Element rss = addNode(doc, doc, "rss"); ! addAttribute(doc, rss, "version", "2.0"); ! addAttribute(doc, rss, "xmlns:opensearch", ! (String)NS_MAP.get("opensearch")); ! 
addAttribute(doc, rss, "xmlns:nutch", (String)NS_MAP.get("nutch")); ! Element channel = addNode(doc, rss, "channel"); ! ! addNode(doc, channel, "title", "Nutch: " + queryString); ! addNode(doc, channel, "description", "Nutch search results for query: " ! + queryString); ! addNode(doc, channel, "link", ! base+"/search.jsp" ! +"?query="+urlQuery ! +"&start="+start ! +"&hitsPerDup="+hitsPerDup ! +params); ! addNode(doc, channel, "opensearch", "totalResults", ""+hits.getTotal()); ! addNode(doc, channel, "opensearch", "startIndex", ""+start); ! addNode(doc, channel, "opensearch", "itemsPerPage", ""+hitsPerPage); ! addNode(doc, channel, "nutch", "query", queryString); ! ! if ((hits.totalIsExact() && end < hits.getTotal()) // more hits to show ! || (!hits.totalIsExact() && (hits.getLength() > start+hitsPerPage))){ ! addNode(doc, channel, "nutch", "nextPage", requestUrl ! +"?query="+urlQuery ! +"&start="+end ! +"&hitsPerDup="+hitsPerDup ! +params); ! } ! if ((!hits.totalIsExact() && (hits.getLength() <= start+hitsPerPage))) { ! addNode(doc, channel, "nutch", "showAllHits", requestUrl ! +"?query="+urlQuery ! +"&hitsPerDup="+0 ! +params); ! } ! for (int i = 0; i < length; i++) { ! Hit hit = show[i]; ! HitDetails detail = details[i]; ! String title = detail.getValue("title"); ! String url = detail.getValue("url"); ! String id = "idx=" + hit.getIndexNo() + "&id=" + hit.getIndexDocNo(); ! ! if (title == null || title.equals("")) // use url for docs w/o title ! title = url; ! Element item = addNode(doc, channel, "item"); ! addNode(doc, item, "title", title); ! addNode(doc, item, "description", summaries[i]); ! addNode(doc, item, "link", url); ! addNode(doc, item, "nutch", "site", hit.getDedupValue()); ! addNode(doc, item, "nutch", "cache", base+"/cached.jsp?"+id); ! addNode(doc, item, "nutch", "explain", base+"/explain.jsp?"+id ! +"&query="+urlQuery); ! if (hit.moreFromDupExcluded()) { ! addNode(doc, item, "nutch", "moreFromSite", requestUrl ! +"?query=" ! 
+URLEncoder.encode("site:"+hit.getDedupValue() ! +" "+queryString, "UTF-8") ! +"&hitsPerSite="+0 ! +params); ! } ! for (int j = 0; j < detail.getLength(); j++) { // add all from detail ! String field = detail.getField(j); ! if (!SKIP_DETAILS.contains(field)) ! addNode(doc, item, "nutch", field, detail.getValue(j)); ! } ! } ! // dump DOM tree ! DOMSource source = new DOMSource(doc); ! TransformerFactory transFactory = TransformerFactory.newInstance(); ! Transformer transformer = transFactory.newTransformer(); ! transformer.setOutputProperty("indent", "yes"); ! StreamResult result = new StreamResult(response.getOutputStream()); ! response.setContentType("text/xml"); ! transformer.transform(source, result); ! } catch (javax.xml.parsers.ParserConfigurationException e) { ! throw new ServletException(e); ! } catch (javax.xml.transform.TransformerException e) { ! throw new ServletException(e); ! } ! ! } ! private static Element addNode(Document doc, Node parent, String name) { ! Element child = doc.createElement(name); ! parent.appendChild(child); ! return child; ! } ! private static void addNode(Document doc, Node parent, ! String name, String text) { ! Element child = doc.createElement(name); ! child.appendChild(doc.createTextNode(getLegalXml(text))); ! parent.appendChild(child); ! } ! private static void addNode(Document doc, Node parent, ! String ns, String name, String text) { ! Element child = doc.createElementNS((String)NS_MAP.get(ns), ns+":"+name); ! child.appendChild(doc.createTextNode(getLegalXml(text))); ! parent.appendChild(child); ! } ! private static void addAttribute(Document doc, Element node, ! String name, String value) { ! Attr attribute = doc.createAttribute(name); ! attribute.setValue(getLegalXml(value)); ! node.getAttributes().setNamedItem(attribute); ! } ! /* ! * Ensure string is legal xml. ! * First look to see if string has illegal characters. If it doesn't, ! * just return it. Otherwise, create new string with illegal characters ! 
* @param text String to verify. ! * @return Passed <code>text</code> or a new string with illegal ! * characters removed if any found in <code>text</code>. ! * @see http://www.w3.org/TR/2000/REC-xml-20001006#NT-Char ! */ ! private static String getLegalXml(final String text) { ! if (text == null) { ! return null; ! } ! boolean allLegal = true; ! for (int i = 0; i < text.length(); i++) { ! if (!isLegalXml(text.charAt(i))) { ! allLegal = false; ! break; ! } ! } ! return allLegal? text: createLegalXml(text); ! } ! private static String createLegalXml(final String text) { ! if (text == null) { ! return null; ! } ! StringBuffer buffer = new StringBuffer(text.length()); ! for (int i = 0; i < text.length(); i++) { ! char c = text.charAt(i); ! if (isLegalXml(c)) { ! buffer.append(c); ! } ! } ! return buffer.toString(); ! } ! ! private static boolean isLegalXml(final char c) { ! return c == 0x9 || c == 0xa || c == 0xd || (c >= 0x20 && c <= 0xd7ff) ! || (c >= 0xe000 && c <= 0xfffd) || (c >= 0x10000 && c <= 0x10ffff); ! } } + |