Revision: 1789 http://archive-access.svn.sourceforge.net/archive-access/?rev=1789&view=rev Author: bradtofel Date: 2007-07-16 16:23:42 -0700 (Mon, 16 Jul 2007) Log Message: ----------- TWEAK: now aware of UrlSearchResults Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2007-07-16 23:22:43 UTC (rev 1788) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2007-07-16 23:23:42 UTC (rev 1789) @@ -41,6 +41,7 @@ import org.archive.wayback.core.SearchResult; import org.archive.wayback.core.SearchResults; import org.archive.wayback.core.Timestamp; +import org.archive.wayback.core.UrlSearchResults; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.AccessControlException; import org.archive.wayback.exception.BadQueryException; @@ -142,7 +143,7 @@ e.getMessage()); } - SearchResults results = new SearchResults(); + SearchResults results = new UrlSearchResults(); NodeList channel = getSearchChannel(document); NodeList nodes = getSearchItems(document); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2226 http://archive-access.svn.sourceforge.net/archive-access/?rev=2226&view=rev Author: bradtofel Date: 2008-04-10 20:48:48 -0700 (Thu, 10 Apr 2008) Log Message: ----------- BUGFIX: ACC-20 : sending correct endDateStr to remote Nutch index. WHITESPACE & COMMENT changes. INTERFACE: added shutdown() method for new ResourceIndex interface. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2008-04-01 02:00:10 UTC (rev 2225) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2008-04-11 03:48:48 UTC (rev 2226) @@ -96,7 +96,6 @@ LOGGER.info("initializing NutchResourceIndex..."); LOGGER.info("Using base search url " + this.searchUrlBase); -// this.factory = DocumentBuilderFactory.newInstance(); this.factory.setNamespaceAware(true); try { this.builder = this.factory.newDocumentBuilder(); @@ -249,32 +248,29 @@ protected String getRequestUrl(WaybackRequest wbRequest) throws BadQueryException { -// final String urlStr, final String date, -// } -// final int count) -// throws IOException { - String urlStr = wbRequest.get(WaybackConstants.REQUEST_URL); - String exactDateStr = wbRequest.get(WaybackConstants.REQUEST_EXACT_DATE); - if (exactDateStr != null && exactDateStr.length() == 0) { - exactDateStr = null; - } - String endDateStr = wbRequest.get(WaybackConstants.REQUEST_END_DATE); - if (endDateStr == null || endDateStr.length() == 0) { - endDateStr = wbRequest.get(WaybackConstants.REQUEST_DATE); - } - String startDateStr = wbRequest.get(WaybackConstants.REQUEST_START_DATE); - if (startDateStr == null || startDateStr.length() == 0) { - startDateStr = Timestamp.earliestTimestamp().getDateStr(); - } - int hitsPerPage = wbRequest.getResultsPerPage(); - if(hitsPerPage < 1) { - throw new BadQueryException("Hits per page must be positive"); - } - if(hitsPerPage > maxRecords) { - throw new BadQueryException("Hits per page must be less than " + - maxRecords); - } - int start = (wbRequest.getPageNum()-1) * hitsPerPage; + + String urlStr = wbRequest.get(WaybackConstants.REQUEST_URL); + String exactDateStr = wbRequest.get(WaybackConstants.REQUEST_EXACT_DATE); + if (exactDateStr != null && exactDateStr.length() == 0) { + exactDateStr = null; + } + String endDateStr = wbRequest.get(WaybackConstants.REQUEST_END_DATE); + if (endDateStr == null || endDateStr.length() == 0) { + endDateStr = Timestamp.latestTimestamp().getDateStr(); + } + String startDateStr = wbRequest.get(WaybackConstants.REQUEST_START_DATE); + if (startDateStr == null || startDateStr.length() == 0) { + startDateStr = Timestamp.earliestTimestamp().getDateStr(); + } + int hitsPerPage = wbRequest.getResultsPerPage(); + if(hitsPerPage < 1) { + throw new BadQueryException("Hits per page must be positive"); + } + if(hitsPerPage > maxRecords) { + throw new BadQueryException("Hits per page must be less than " + + maxRecords); + } + int start = (wbRequest.getPageNum()-1) * hitsPerPage; if (urlStr == null || urlStr.length() <= 0) { throw new BadQueryException("Url is empty."); } @@ -282,16 +278,8 @@ MutableString ms = new MutableString(this.searchUrlBase) .append("?query="); // Add 'date:...+' to query string. - // As searching for exact dates is not what we want in most cases, - // we will only use this if REQUEST_[END_]DATE is empty; - if ((endDateStr == null || endDateStr.length() == 0) - && exactDateStr != null && exactDateStr.length() > 0) { - ms.append("date%3A").append(exactDateStr).append('+'); - - } else { - ms.append("date%3A").append(startDateStr).append('-').append( - exactDateStr != null ? exactDateStr : endDateStr).append('+'); - } + ms.append("date%3A").append(startDateStr).append('-').append(endDateStr); + ms.append('+'); // Add 'url:URL'. if(wbRequest.get(WaybackConstants.REQUEST_TYPE).equals( WaybackConstants.REQUEST_URL_PREFIX_QUERY)) { @@ -303,10 +291,6 @@ } catch (UnsupportedEncodingException e) { throw new BadQueryException(e.toString()); } - // when searching for exacturl, we are mostly - // interested in the different versions over the time -// ms.append("&sort=date"); -// ms.append("&reverse=true"); } ms.append("&hitsPerPage=").append(hitsPerPage); ms.append("&start=").append(start); @@ -348,28 +332,31 @@ d = this.builder.parse(url); return d; } -/** - * @return the searchUrlBase - */ -public String getSearchUrlBase() { - return searchUrlBase; + /** + * @return the searchUrlBase + */ + public String getSearchUrlBase() { + return searchUrlBase; + } + /** + * @param searchUrlBase the searchUrlBase to set + */ + public void setSearchUrlBase(String searchUrlBase) { + this.searchUrlBase = searchUrlBase; + } + /** + * @return the maxRecords + */ + public int getMaxRecords() { + return maxRecords; + } + /** + * @param maxRecords the maxRecords to set + */ + public void setMaxRecords(int maxRecords) { + this.maxRecords = maxRecords; + } + public void shutdown() throws IOException { + + } } -/** - * @param searchUrlBase the searchUrlBase to set - */ -public void setSearchUrlBase(String searchUrlBase) { - this.searchUrlBase = searchUrlBase; -} -/** - * @return the maxRecords - */ -public int getMaxRecords() { - return maxRecords; -} -/** - * @param maxRecords the maxRecords to set - */ -public void setMaxRecords(int maxRecords) { - this.maxRecords = maxRecords; -} -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2421 http://archive-access.svn.sourceforge.net/archive-access/?rev=2421&view=rev Author: bradtofel Date: 2008-07-08 16:45:08 -0700 (Tue, 08 Jul 2008) Log Message: ----------- INTERFACE: now works with NutchWax 0.12.1 xml results. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2008-07-08 22:03:16 UTC (rev 2420) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2008-07-08 23:45:08 UTC (rev 2421) @@ -28,6 +28,8 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; +import java.util.ArrayList; +import java.util.List; import java.util.logging.Logger; import javax.xml.parsers.DocumentBuilder; @@ -71,11 +73,14 @@ private DocumentBuilder builder; private static final String NUTCH_ARCNAME = "arcname"; private static final String NUTCH_ARCOFFSET = "arcoffset"; - private static final String NUTCH_ARCDATE = "tstamp"; - private static final String NUTCH_ARCDATE_ALT = "arcdate"; +// private static final String NUTCH_FILENAME = "filename"; +// private static final String NUTCH_FILEOFFSET = "fileoffset"; + private static final String NUTCH_ARCDATE = "date"; +// private static final String NUTCH_ARCDATE_ALT = "arcdate"; private static final String NUTCH_DIGEST = "digest"; - private static final String NUTCH_PRIMARY_TYPE = "primaryType"; - private static final String NUTCH_SUB_TYPE = "subType"; + private static final String NUTCH_MIME_TYPE = "type"; +// private static final String NUTCH_PRIMARY_TYPE = "primaryType"; +// private static final String NUTCH_SUB_TYPE = "subType"; // private static final String NUTCH_CAPTURE_HOST = "site"; private static final String NUTCH_CAPTURE_URL = "link"; @@ -117,6 +122,7 @@ Document document = null; try { // HTTP Request + parse + LOGGER.info("Requesting OpenSearch: " + requestUrl); document = getHttpDocument(requestUrl); } catch (IOException e) { // TODO: better error for user: @@ -132,8 +138,10 @@ if(wbRequest.isReplayRequest() || wbRequest.isCaptureQueryRequest()) { results = new CaptureSearchResults(); } else { - // TODO: this is wrong, but needs exploration into what NutchWax can actually do. - throw new BadQueryException("Unable to perform path prefix requests with this index type"); + // TODO: this is wrong, but needs exploration into what NutchWax + // can actually do. + throw new BadQueryException("Unable to perform path " + + "prefix requests with this index type"); } NodeList channel = getSearchChannel(document); NodeList nodes = getSearchItems(document); @@ -154,8 +162,12 @@ Element e = (Element) nodes.item(i); - CaptureSearchResult result = elementToSearchResult(e); - results.addSearchResult(result); + List<CaptureSearchResult> resultsList = itemToSearchResults(e); + if(resultsList != null) { + for(CaptureSearchResult result : resultsList) { + results.addSearchResult(result); + } + } } Element channelElement = (Element) channel.item(0); @@ -179,42 +191,45 @@ return results; } - private CaptureSearchResult elementToSearchResult(Element e) + private List<CaptureSearchResult> itemToSearchResults(Element e) throws ResourceIndexNotAvailableException { - CaptureSearchResult result = new CaptureSearchResult(); + String fileName = getNodeNutchContent(e,NUTCH_ARCNAME); + String httpCode = NUTCH_DEFAULT_HTTP_CODE; + String digest = getNodeNutchContent(e,NUTCH_DIGEST); + String mimeType = getNodeNutchContent(e,NUTCH_MIME_TYPE); + String offsetStr = getNodeNutchContent(e,NUTCH_ARCOFFSET); + long offset = 0; + if(offsetStr != null && offsetStr.length() > 0) { + offset = Long.parseLong(offsetStr); + } + String redirectUrl = NUTCH_DEFAULT_REDIRECT_URL; + String originalUrl = getNodeContent(e,NUTCH_CAPTURE_URL); + String urlKey = originalUrl; + + NodeList nodes = e.getElementsByTagNameNS(NUTCH_NS,NUTCH_ARCDATE); + int numDates = nodes.getLength(); + ArrayList<CaptureSearchResult> results = null; - result.setFile(getNodeNutchContent(e,NUTCH_ARCNAME)); + if(numDates > 0) { + results = new ArrayList<CaptureSearchResult>(); - // The date in nutchwax is now named 'tstamp' and its - // 17 characters rather than 14. Pass first 14 only. - String d = getNodeNutchContent(e,NUTCH_ARCDATE); - if(d == null) { - d = getNodeNutchContent(e,NUTCH_ARCDATE_ALT); - } - if(d == null) { - throw new ResourceIndexNotAvailableException("Missing arcdate field in search results"); - } - if (d.length() == 17) { - d = d.substring(0, 14); - } - result.setCaptureTimestamp(d); - - //result.put(WaybackConstants.RESULT_HTTP_CODE,getNodeContent(e,"")); - result.setHttpCode(NUTCH_DEFAULT_HTTP_CODE); - result.setDigest(getNodeNutchContent(e,NUTCH_DIGEST)); - - result.setMimeType(getNodeNutchContent(e,NUTCH_PRIMARY_TYPE) + "/" + - getNodeNutchContent(e,NUTCH_SUB_TYPE)); - - result.setOffset(Long.parseLong(getNodeNutchContent(e,NUTCH_ARCOFFSET))); - - result.setRedirectUrl(NUTCH_DEFAULT_REDIRECT_URL); - result.setCaptureTimestamp(getNodeContent(e,NUTCH_CAPTURE_URL)); - result.setOriginalUrl(getNodeContent(e,NUTCH_CAPTURE_URL)); - result.setUrlKey(getNodeContent(e,NUTCH_CAPTURE_URL)); - - return result; + for(int i = 0; i < numDates; i++) { + String captureDate = nodes.item(i).getTextContent(); + CaptureSearchResult result = new CaptureSearchResult(); + result.setFile(fileName); + result.setCaptureTimestamp(captureDate); + result.setHttpCode(httpCode); + result.setDigest(digest); + result.setMimeType(mimeType); + result.setOffset(offset); + result.setRedirectUrl(redirectUrl); + result.setOriginalUrl(originalUrl); + result.setUrlKey(urlKey); + results.add(result); + } + } + return results; } protected NodeList getSearchChannel(Document d) { @@ -271,13 +286,13 @@ ms.append("date%3A").append(startDateStr).append('-').append(endDateStr); ms.append('+'); // Add 'url:URL'. - if(wbRequest.isUrlQueryRequest()) { +// if(wbRequest.isUrlQueryRequest()) { ms.append("url%3A"); - } else { - ms.append("exacturl%3A"); - } +// } else { +// ms.append("exacturl%3A"); +// } try { - ms.append(java.net.URLEncoder.encode(urlStr, "UTF-8")); + ms.append(java.net.URLEncoder.encode("\""+urlStr+"\"", "UTF-8")); } catch (UnsupportedEncodingException e) { throw new BadQueryException(e.toString()); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |