From: <bra...@us...> - 2008-07-01 23:24:42
Revision: 2355
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2355&view=rev
Author:   bradtofel
Date:     2008-07-01 16:24:50 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
REFACTOR: SearchResult => (Url|Capture)SearchResult

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilterFactory.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilter.java	2008-07-01 23:23:28 UTC (rev 2354)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilter.java	2008-07-01 23:24:50 UTC (rev 2355)
@@ -6,13 +6,10 @@
 import org.archive.accesscontrol.RobotsUnavailableException;
 import org.archive.accesscontrol.RuleOracleUnavailableException;
 import org.archive.util.ArchiveUtils;
-import org.archive.wayback.WaybackConstants;
-import org.archive.wayback.core.SearchResult;
-import org.archive.wayback.core.Timestamp;
+import org.archive.wayback.core.CaptureSearchResult;
 import org.archive.wayback.util.ObjectFilter;
 
-public class OracleExclusionFilter implements ObjectFilter<SearchResult> {
-    ObjectFilter<SearchResult> robotFilter = null;
+public class OracleExclusionFilter implements ObjectFilter<CaptureSearchResult> {
     AccessControlClient client = null;
     private String accessGroup = null;
@@ -27,10 +24,9 @@
     }
 
-    public int filterObject(SearchResult o) {
-        String url = o.get(WaybackConstants.RESULT_URL);
-        Date captureDate = Timestamp.parseBefore(
-                o.get(WaybackConstants.RESULT_CAPTURE_DATE)).getDate();
+    public int filterObject(CaptureSearchResult o) {
+        String url = o.getOriginalUrl();
+        Date captureDate = o.getCaptureDate();
         Date retrievalDate = new Date();
 
         String policy;
@@ -60,12 +56,4 @@
         }
         return FILTER_EXCLUDE;
     }
-
-    public ObjectFilter<SearchResult> getRobotFilter() {
-        return robotFilter;
-    }
-
-    public void setRobotFilter(ObjectFilter<SearchResult> robotFilter) {
-        this.robotFilter = robotFilter;
-    }
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilterFactory.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilterFactory.java	2008-07-01 23:23:28 UTC (rev 2354)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/oracleclient/OracleExclusionFilterFactory.java	2008-07-01 23:24:50 UTC (rev 2355)
@@ -1,22 +1,17 @@
 package org.archive.wayback.accesscontrol.oracleclient;
 
 import org.archive.wayback.accesscontrol.ExclusionFilterFactory;
-import org.archive.wayback.accesscontrol.robotstxt.RobotExclusionFilterFactory;
-import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.core.CaptureSearchResult;
 import org.archive.wayback.util.ObjectFilter;
 
 public class OracleExclusionFilterFactory implements ExclusionFilterFactory {
-    private RobotExclusionFilterFactory robotFactory = null;
     private String oracleUrl = null;
     private String accessGroup = null;
 
-    public ObjectFilter<SearchResult> get() {
+    public ObjectFilter<CaptureSearchResult> get() {
         OracleExclusionFilter filter = new OracleExclusionFilter(oracleUrl,
                 accessGroup);
-        if(robotFactory != null) {
-            filter.setRobotFilter(robotFactory.get());
-        }
         return filter;
     }
 
@@ -24,14 +19,6 @@
         // no-op... yet..
     }
 
-    public RobotExclusionFilterFactory getRobotFactory() {
-        return robotFactory;
-    }
-
-    public void setRobotFactory(RobotExclusionFilterFactory robotFactory) {
-        this.robotFactory = robotFactory;
-    }
-
     public String getOracleUrl() {
         return oracleUrl;
    }
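For illustration of the new typed API: an exclusion filter now works against CaptureSearchResult accessors instead of string-keyed lookups. A minimal hypothetical sketch (the class below is invented for illustration; only ObjectFilter, CaptureSearchResult, and the methods visible in the diff above are assumed):

    import java.util.Date;
    import org.archive.wayback.core.CaptureSearchResult;
    import org.archive.wayback.util.ObjectFilter;

    // Hypothetical filter that excludes captures made before a cutoff date.
    public class MinDateExclusionFilter implements ObjectFilter<CaptureSearchResult> {
        private final Date cutoff;

        public MinDateExclusionFilter(Date cutoff) {
            this.cutoff = cutoff;
        }

        public int filterObject(CaptureSearchResult o) {
            // Typed accessors replace the old o.get(WaybackConstants.RESULT_URL)
            // string lookups and manual Timestamp parsing.
            Date captureDate = o.getCaptureDate();
            return captureDate.before(cutoff) ? FILTER_EXCLUDE : FILTER_INCLUDE;
        }
    }

Splitting SearchResult into capture- and url-flavored types lets the compiler, rather than runtime string keys, guarantee that capture-level filters only ever see capture records.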
From: <bra...@us...> - 2008-07-01 23:23:18
Revision: 2354
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2354&view=rev
Author:   bradtofel
Date:     2008-07-01 16:23:28 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
REFACTOR: SearchResult => (Url|Capture)SearchResult

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/CompositeExclusionFilterFactory.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/ExclusionFilterFactory.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/ExternalExcluder.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/CompositeExclusionFilterFactory.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/CompositeExclusionFilterFactory.java	2008-07-01 23:21:32 UTC (rev 2353)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/CompositeExclusionFilterFactory.java	2008-07-01 23:23:28 UTC (rev 2354)
@@ -27,7 +27,7 @@
 import java.util.ArrayList;
 import java.util.Iterator;
 
-import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.core.CaptureSearchResult;
 import org.archive.wayback.resourceindex.filters.CompositeExclusionFilter;
 import org.archive.wayback.util.ObjectFilter;
 
@@ -54,7 +54,7 @@
     /* (non-Javadoc)
      * @see org.archive.wayback.resourceindex.ExclusionFilterFactory#get()
      */
-    public ObjectFilter<SearchResult> get() {
+    public ObjectFilter<CaptureSearchResult> get() {
         Iterator<ExclusionFilterFactory> itr = factories.iterator();
         CompositeExclusionFilter filter = new CompositeExclusionFilter();
         while(itr.hasNext()) {

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/ExclusionFilterFactory.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/ExclusionFilterFactory.java	2008-07-01 23:21:32 UTC (rev 2353)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/ExclusionFilterFactory.java	2008-07-01 23:23:28 UTC (rev 2354)
@@ -24,7 +24,7 @@
  */
 package org.archive.wayback.accesscontrol;
 
-import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.core.CaptureSearchResult;
 import org.archive.wayback.util.ObjectFilter;
 
 /**
  *
@@ -37,7 +37,7 @@
      * @return an ObjectFilter object that filters records based on
      *         some set of exclusion rules
      */
-    public ObjectFilter<SearchResult> get();
+    public ObjectFilter<CaptureSearchResult> get();
 
     /**
      * close any resources used by this ExclusionFilter system.
     */

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/ExternalExcluder.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/ExternalExcluder.java	2008-07-01 23:21:32 UTC (rev 2353)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/ExternalExcluder.java	2008-07-01 23:23:28 UTC (rev 2354)
@@ -24,10 +24,8 @@
  */
 package org.archive.wayback.accesscontrol;
 
-import org.apache.commons.httpclient.URIException;
-import org.archive.net.LaxURI;
-import org.archive.wayback.WaybackConstants;
-import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.core.Timestamp;
 import org.archive.wayback.util.ObjectFilter;
 import org.springframework.beans.factory.xml.XmlBeanFactory;
 import org.springframework.core.io.FileSystemResource;
@@ -44,12 +42,12 @@
  */
 public class ExternalExcluder {
     private static ExclusionFilterFactory factory = null;
-    private ObjectFilter<SearchResult> filter = null;
+    private ObjectFilter<CaptureSearchResult> filter = null;
     private final static String CONFIG_ID = "excluder-factory";
     /**
      * @param filter
     */
-    public ExternalExcluder(ObjectFilter<SearchResult> filter) {
+    public ExternalExcluder(ObjectFilter<CaptureSearchResult> filter) {
         this.filter = filter;
     }
     /**
@@ -58,20 +56,10 @@
      * @return true if the url-timestamp should not be shown to end users
     */
     public boolean isExcluded(String urlString, String timestamp) {
-        SearchResult sr = new SearchResult();
+        CaptureSearchResult sr = new CaptureSearchResult();
 
-        LaxURI url = null;
-        String host = null;
-        try {
-            url = new LaxURI(urlString,true);
-            host = url.getHost();
-        } catch (URIException e) {
-            // TODO Auto-generated catch block
-            e.printStackTrace();
-            return true;
-        }
-        sr.put(WaybackConstants.RESULT_ORIG_HOST, host);
-        sr.put(WaybackConstants.RESULT_URL, urlString);
+        sr.setOriginalUrl(urlString);
+        sr.setCaptureTimestamp(Timestamp.parseBefore(timestamp).getDateStr());
 
         int ruling = filter.filterObject(sr);
         return (ruling != ObjectFilter.FILTER_INCLUDE);
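A hedged usage sketch of the reworked ExternalExcluder; the stand-in filter below is hypothetical, while the constructor and isExcluded(url, timestamp) signature come from the diff above:

    import org.archive.wayback.accesscontrol.ExternalExcluder;
    import org.archive.wayback.core.CaptureSearchResult;
    import org.archive.wayback.util.ObjectFilter;

    public class ExcluderDemo {
        public static void main(String[] args) {
            // Trivial stand-in filter that includes everything; a real setup
            // would obtain one from an ExclusionFilterFactory.
            ObjectFilter<CaptureSearchResult> filter =
                    new ObjectFilter<CaptureSearchResult>() {
                        public int filterObject(CaptureSearchResult o) {
                            return FILTER_INCLUDE;
                        }
                    };
            ExternalExcluder excluder = new ExternalExcluder(filter);
            // isExcluded() now fills a CaptureSearchResult via setOriginalUrl()
            // and setCaptureTimestamp() instead of string-keyed put() calls.
            boolean hidden = excluder.isExcluded("http://example.org/", "20080701000000");
            System.out.println("excluded: " + hidden);
        }
    }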
From: <bra...@us...> - 2008-07-01 23:21:23
Revision: 2353
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2353&view=rev
Author:   bradtofel
Date:     2008-07-01 16:21:32 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
REFACTOR: SearchResult => (Url|Capture)SearchResult

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java	2008-07-01 23:17:54 UTC (rev 2352)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java	2008-07-01 23:21:32 UTC (rev 2353)
@@ -59,7 +59,7 @@
     private String contextPrefix = null;
     private String serverPrefix = null;
     private AccessPoint context = null;
-    private ObjectFilter<SearchResult> exclusionFilter = null;
+    private ObjectFilter<CaptureSearchResult> exclusionFilter = null;
 
     private HashMap<String,String> filters = new HashMap<String,String>();
 
@@ -372,7 +372,6 @@
     public WaybackRequest clone() {
         WaybackRequest wbRequest = new WaybackRequest();
-
         wbRequest.contextPrefix = contextPrefix;
         wbRequest.resultsPerPage = resultsPerPage;
         wbRequest.pageNum = pageNum;
@@ -406,11 +405,11 @@
         this.context = context;
     }
 
-    public ObjectFilter<SearchResult> getExclusionFilter() {
+    public ObjectFilter<CaptureSearchResult> getExclusionFilter() {
         return exclusionFilter;
     }
 
-    public void setExclusionFilter(ObjectFilter<SearchResult> exclusionFilter) {
+    public void setExclusionFilter(ObjectFilter<CaptureSearchResult> exclusionFilter) {
         this.exclusionFilter = exclusionFilter;
    }
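The practical effect is that a request can now carry a capture-typed exclusion filter end to end. A small hypothetical sketch (the filter body is invented; the setter and getter come from the diff above):

    import org.archive.wayback.core.CaptureSearchResult;
    import org.archive.wayback.core.WaybackRequest;
    import org.archive.wayback.util.ObjectFilter;

    public class RequestFilterDemo {
        public static void main(String[] args) {
            WaybackRequest request = new WaybackRequest();
            // Hypothetical filter that excludes nothing.
            request.setExclusionFilter(new ObjectFilter<CaptureSearchResult>() {
                public int filterObject(CaptureSearchResult o) {
                    return FILTER_INCLUDE;
                }
            });
            // Index code can now call filterObject() on each CaptureSearchResult
            // it considers, with no casts or string-keyed lookups.
            ObjectFilter<CaptureSearchResult> f = request.getExclusionFilter();
            System.out.println(f != null);
        }
    }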
Revision: 2352
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2352&view=rev
Author:   bradtofel
Date:     2008-07-01 16:17:54 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
REFACTOR: code moved from ...resourceindex.filters. to a better-defined Adapter

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java	2008-07-01 23:17:54 UTC (rev 2352)
@@ -0,0 +1,113 @@
+/* CaptureToUrlSearchResultAdapter
+ *
+ * $Id$
+ *
+ * Created on 4:45:55 PM Jun 28, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.core;
+
+import java.util.HashMap;
+
+import org.archive.wayback.util.AdaptedIterator;
+import org.archive.wayback.util.Adapter;
+import org.archive.wayback.util.CloseableIterator;
+
+/**
+ *
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class CaptureToUrlSearchResultAdapter
+implements Adapter<CaptureSearchResult, UrlSearchResult> {
+
+    private String currentUrl;
+    private String originalUrl;
+    private String firstCapture;
+    private String lastCapture;
+    private int numCaptures;
+    private HashMap<String,Object> digests;
+    private UrlSearchResult resultRef = null;
+    public CaptureToUrlSearchResultAdapter() {
+
+    }
+    private UrlSearchResult makeUrlSearchResult(CaptureSearchResult result) {
+        currentUrl = result.getUrlKey();
+        originalUrl = result.getOriginalUrl();
+        firstCapture = result.getCaptureTimestamp();
+        lastCapture = firstCapture;
+        digests = new HashMap<String,Object>();
+        digests.put(result.getDigest(),null);
+        numCaptures = 1;
+
+        resultRef = new UrlSearchResult();
+        resultRef.setUrlKey(currentUrl);
+        resultRef.setOriginalUrl(originalUrl);
+        resultRef.setFirstCapture(firstCapture);
+        resultRef.setLastCapture(lastCapture);
+        resultRef.setNumCaptures(1);
+        resultRef.setNumVersions(1);
+        return resultRef;
+    }
+
+    /* (non-Javadoc)
+     * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+     */
+    public UrlSearchResult adapt(CaptureSearchResult c) {
+        String urlKey = c.getUrlKey();
+        if(resultRef == null || !currentUrl.equals(urlKey)) {
+            return makeUrlSearchResult(c);
+        }
+
+        // same url -- accumulate into the last one we returned:
+        String captureDate = c.getCaptureTimestamp();
+        if(captureDate.compareTo(firstCapture) < 0) {
+            firstCapture = captureDate;
+            resultRef.setFirstCapture(firstCapture);
+        }
+        if(captureDate.compareTo(lastCapture) > 0) {
+            lastCapture = captureDate;
+            resultRef.setLastCapture(lastCapture);
+        }
+        numCaptures++;
+        digests.put(c.getDigest(), null);
+        resultRef.setNumCaptures(numCaptures);
+        resultRef.setNumVersions(digests.size());
+        return null;
+    }
+    public static CloseableIterator<UrlSearchResult> adaptCaptureIterator(
+            CloseableIterator<CaptureSearchResult> itr) {
+
+        // HACKHACK: this is pretty lame. We return an UrlSearchResult the
+        // first time we see a new urlKey, and cache a reference to the returned
+        // UrlSearchResult, updating it as we see subsequent CaptureSearchResult
+        // objects with the same urlKey.
+        // This means that users of the returned UrlSearchResult need to wait
+        // until they've got the *next* returned UrlSearchResult before using
+        // the *previous* UrlSearchResult.
+        // At the moment, this all happens inside a LocalResourceIndex, so
+        // none of the UrlSearchResult objects should be seen/used in any
+        // significant way before they've all been accumulated into an
+        // UrlSearchResults object..
+        return new AdaptedIterator<CaptureSearchResult,UrlSearchResult>(itr,
+                new CaptureToUrlSearchResultAdapter());
+    }
+}
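A hedged sketch of draining the adapted iterator while honoring the HACKHACK caveat above: an emitted UrlSearchResult may still be mutated until the next one is emitted, so nothing should be read from the results until iteration completes. It assumes CloseableIterator extends java.util.Iterator and that AdaptedIterator skips null adaptations, as the accumulate-and-return-null pattern requires; the input must be ordered by urlKey:

    import java.util.ArrayList;
    import java.util.List;
    import org.archive.wayback.core.CaptureSearchResult;
    import org.archive.wayback.core.CaptureToUrlSearchResultAdapter;
    import org.archive.wayback.core.UrlSearchResult;
    import org.archive.wayback.util.CloseableIterator;

    public class UrlAggregationDemo {
        // captures must be sorted by urlKey for the accumulation to work.
        static List<UrlSearchResult> aggregate(
                CloseableIterator<CaptureSearchResult> captures) {
            CloseableIterator<UrlSearchResult> urls =
                    CaptureToUrlSearchResultAdapter.adaptCaptureIterator(captures);
            List<UrlSearchResult> all = new ArrayList<UrlSearchResult>();
            while (urls.hasNext()) {
                // The element returned here may still be mutated by later
                // captures of the same urlKey...
                all.add(urls.next());
            }
            // ...but once the iterator is exhausted, every element is final.
            return all;
        }
    }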
From: <bra...@us...> - 2008-07-01 23:16:37
Revision: 2351
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2351&view=rev
Author:   bradtofel
Date:     2008-07-01 16:16:44 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
REFACTOR: now these classes are directly aware of the type of SearchResult they contain.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResults.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UrlSearchResults.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResults.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResults.java	2008-07-01 23:16:20 UTC (rev 2350)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResults.java	2008-07-01 23:16:44 UTC (rev 2351)
@@ -24,7 +24,8 @@
  */
 package org.archive.wayback.core;
 
-import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.Date;
 import java.util.Iterator;
 
 import org.archive.wayback.WaybackConstants;
@@ -36,62 +37,113 @@
  * @version $Date$, $Revision$
  */
 public class CaptureSearchResults extends SearchResults {
-    public String getResultsType() {
-        return WaybackConstants.RESULTS_TYPE_CAPTURE;
+    /**
+     * List of CaptureSearchResult objects for index records matching a query
+     */
+    private ArrayList<CaptureSearchResult> results =
+        new ArrayList<CaptureSearchResult>();
+
+    /**
+     * 14-digit timestamp of first capture date contained in the SearchResults
+     */
+    private String firstResultTimestamp;
+
+    /**
+     * 14-digit timestamp of last capture date contained in the SearchResults
+     */
+    private String lastResultTimestamp;
+
+    /**
+     * @return Returns the 14-digit String Timestamp of the first Capture in
+     * this set of SearchResult objects
+     */
+    public String getFirstResultTimestamp() {
+        return firstResultTimestamp;
    }
 
     /**
-     * append a result
-     * @param result
+     * @return Returns the firstResult Date.
     */
-    public void addSearchResult(final SearchResult result) {
-        addSearchResult(result,true);
+    public Date getFirstResultDate() {
+        return new Timestamp(firstResultTimestamp).getDate();
    }
+
     /**
-     * add a result to this results, at either the begginning or at the end,
-     * depending on the append argument
-     * @param result
-     *            SearchResult to add to this set
-     * @param append
+     * @return Returns the 14-digit String Timestamp of the last Capture in
+     * this set of SearchResult objects
     */
-    public void addSearchResult(final SearchResult result, final boolean append) {
-        String resultDate = result.get(WaybackConstants.RESULT_CAPTURE_DATE);
-        if((firstResultDate == null) ||
-                (firstResultDate.compareTo(resultDate) > 0)) {
-            firstResultDate = resultDate;
-        }
-        if((lastResultDate == null) ||
-                (lastResultDate.compareTo(resultDate) < 0)) {
-            lastResultDate = resultDate;
-        }
-        addSearchResultRaw(result,append);
+    public String getLastResultTimestamp() {
+        return lastResultTimestamp;
    }
+
+    public Date getLastResultDate() {
+        return new Timestamp(lastResultTimestamp).getDate();
+    }
+
     /**
      * @param wbRequest
-     * @return The closest SearchResult to the request.
-     * @throws ParseException
+     * @return The closest CaptureSearchResult to the request.
     */
-    public SearchResult getClosest(WaybackRequest wbRequest) {
+    public CaptureSearchResult getClosest(WaybackRequest wbRequest) {
 
-        SearchResult closest = null;
+        CaptureSearchResult closest = null;
         long closestDistance = 0;
-        SearchResult cur = null;
-        Timestamp wantTimestamp;
-        wantTimestamp = Timestamp.parseBefore(wbRequest
-                .get(WaybackConstants.REQUEST_EXACT_DATE));
+        CaptureSearchResult cur = null;
+        long wantTime = Timestamp.parseBefore(wbRequest
+                .get(WaybackConstants.REQUEST_EXACT_DATE)).getDate().getTime();
 
-        Iterator<SearchResult> itr = results.iterator();
+        Iterator<CaptureSearchResult> itr = results.iterator();
         while (itr.hasNext()) {
             cur = itr.next();
-            long curDistance;
-            Timestamp curTimestamp = Timestamp.parseBefore(cur
-                    .get(WaybackConstants.RESULT_CAPTURE_DATE));
-            curDistance = curTimestamp.absDistanceFromTimestamp(wantTimestamp);
-
+            long curDistance = Math.abs(wantTime -
+                    cur.getCaptureDate().getTime());
+
             if ((closest == null) || (curDistance < closestDistance)) {
                 closest = cur;
                 closestDistance = curDistance;
             }
         }
         return closest;
+    }
+    /**
+     * append a result
+     * @param result
+     */
+    public void addSearchResult(CaptureSearchResult result) {
+        addSearchResult(result,true);
+    }
+    /**
+     * add a result to this results, at either the beginning or at the end,
+     * depending on the append argument
+     * @param result
+     *            SearchResult to add to this set
+     * @param append
+     */
+    public void addSearchResult(CaptureSearchResult result, boolean append) {
+        String resultDate = result.getCaptureTimestamp();
+        if((firstResultTimestamp == null) ||
+                (firstResultTimestamp.compareTo(resultDate) > 0)) {
+            firstResultTimestamp = resultDate;
+        }
+        if((lastResultTimestamp == null) ||
+                (lastResultTimestamp.compareTo(resultDate) < 0)) {
+            lastResultTimestamp = resultDate;
+        }
+
+        if(append) {
+            results.add(result);
+        } else {
+            results.add(0,result);
+        }
    }
+
+    public boolean isEmpty() {
+        return results.isEmpty();
+    }
+
+    public Iterator<CaptureSearchResult> iterator() {
+        return results.iterator();
+    }
+
+    public int size() {
+        return results.size();
+    }
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UrlSearchResults.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UrlSearchResults.java	2008-07-01 23:16:20 UTC (rev 2350)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UrlSearchResults.java	2008-07-01 23:16:44 UTC (rev 2351)
@@ -24,7 +24,8 @@
  */
 package org.archive.wayback.core;
 
-import org.archive.wayback.WaybackConstants;
+import java.util.ArrayList;
+import java.util.Iterator;
 
 /**
  *
@@ -33,19 +34,33 @@
  * @version $Date$, $Revision$
 */
 public class UrlSearchResults extends SearchResults {
+    /**
+     * List of UrlSearchResult objects for index records matching a query
+     */
+    private ArrayList<UrlSearchResult> results =
+        new ArrayList<UrlSearchResult>();
 
-    public String getResultsType() {
-        return WaybackConstants.RESULTS_TYPE_URL;
-    }
-    public void addSearchResult(SearchResult result) {
+    public void addSearchResult(UrlSearchResult result) {
         addSearchResult(result,true);
     }
-    /* (non-Javadoc)
-     * @see org.archive.wayback.core.SearchResults#addSearchResult(org.archive.wayback.core.SearchResult, boolean)
-     */
-    @Override
-    public void addSearchResult(SearchResult result, boolean append) {
-        addSearchResultRaw(result,append);
+    public void addSearchResult(UrlSearchResult result, boolean append) {
+        if(append) {
+            results.add(result);
+        } else {
+            results.add(0,result);
+        }
     }
+
+    public boolean isEmpty() {
+        return results.isEmpty();
+    }
+
+    public Iterator<UrlSearchResult> iterator() {
+        return results.iterator();
+    }
+
+    public int size() {
+        return results.size();
+    }
 }
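The closest-match selection above is easy to state standalone. A distilled re-implementation over a plain list (not the shipping code; only the CaptureSearchResult accessors from this commit series are assumed):

    import java.util.Date;
    import java.util.List;
    import org.archive.wayback.core.CaptureSearchResult;

    public class ClosestCaptureDemo {
        // Returns the capture whose date is nearest to 'wanted', mirroring
        // CaptureSearchResults.getClosest() but without a WaybackRequest.
        static CaptureSearchResult closest(List<CaptureSearchResult> captures,
                Date wanted) {
            long wantTime = wanted.getTime();
            CaptureSearchResult best = null;
            long bestDistance = 0;
            for (CaptureSearchResult cur : captures) {
                long distance = Math.abs(wantTime - cur.getCaptureDate().getTime());
                if (best == null || distance < bestDistance) {
                    best = cur;
                    bestDistance = distance;
                }
            }
            return best;
        }
    }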
From: <bra...@us...> - 2008-07-01 23:16:14
Revision: 2350
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2350&view=rev
Author:   bradtofel
Date:     2008-07-01 16:16:20 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
REFACTOR: these are now base classes with common methods for the (Url|Capture)SearchResult(s) classes.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResult.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResults.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResult.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResult.java	2008-07-01 23:13:28 UTC (rev 2349)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResult.java	2008-07-01 23:16:20 UTC (rev 2350)
@@ -24,10 +24,10 @@
  */
 package org.archive.wayback.core;
 
-import java.util.Properties;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
 
-import org.archive.wayback.WaybackConstants;
-
 /**
  *
  *
@@ -35,6 +35,9 @@
  * @version $Date$, $Revision$
 */
 public class SearchResult {
+
+    public static final String RESULT_TRUE_VALUE = "true";
+
     /**
      * Expandable Data bag for String to String tuples -- who knows what data
      * we'll want to put in an Index. Perhaps this should BE a Properties,
@@ -42,71 +45,39 @@
      * 'type' field that would allow discrimination/hinting at what kind
      * of data might be found in the Properties...
     */
-    private Properties data = null;
-
-    /**
-     * Constructor
-     */
+    protected HashMap<String,String> data = null;
+
     public SearchResult() {
-        super();
-        data = new Properties();
+        data = new HashMap<String,String>();
    }
-
-    /**
-     * @param key
-     * @return boolean true if 'key' is a key in 'data'
-     */
-    public boolean containsKey(String key) {
-        return data.containsKey(key);
-    }
-
-    /**
-     * @param key
-     * @return String value for key 'key' -- null if 'key' does not exist
-     */
     public String get(String key) {
-        return (String) data.get(key);
+        return data.get(key);
    }
-
-    /**
-     * @param key
-     * @param value
-     * @return String previous value of 'key'
-     */
-    public String put(String key, String value) {
-        return (String) data.put(key, value);
+    public void put(String key, String value) {
+        data.put(key,value);
    }
-
-    /**
-     * @return Returns the data.
-     */
-    public Properties getData() {
-        return data;
+    public boolean getBoolean(String key) {
+        String value = get(key);
+        return (value != null && value.equals(RESULT_TRUE_VALUE));
    }
-
-    /**
-     * @return the (probably) 14-digit timestamp indicating when this capture
-     * was made.
-     */
-    public String getCaptureDate() {
-        return get(WaybackConstants.RESULT_CAPTURE_DATE);
+    public void putBoolean(String key, boolean value) {
+        if(value) {
+            put(key,RESULT_TRUE_VALUE);
+        } else {
+            data.remove(key);
+        }
    }
-
-    /**
-     * @return the url that created this request, without the leading http://
-     */
-    public String getUrl() {
-        return get(WaybackConstants.RESULT_URL);
+    protected String dateToTS(Date date) {
+        return new Timestamp(date).getDateStr();
    }
-
-    /**
-     * @return the url that created this request, including the leading http://
-     */
-    public String getAbsoluteUrl() {
-        String url = get(WaybackConstants.RESULT_URL);
-        if(url.startsWith(WaybackConstants.HTTP_URL_PREFIX)) {
-            return url;
-        }
-        return WaybackConstants.HTTP_URL_PREFIX + url;
+    protected Date tsToDate(String timestamp) {
+        return Timestamp.parseBefore(timestamp).getDate();
    }
+    public Map<String, String> toCanonicalStringMap() {
+        return data;
+    }
+    public void fromCanonicalStringMap(Map<String, String> canonical) {
+        data = new HashMap<String, String>();
+        data.putAll(canonical);
+    }
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResults.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResults.java	2008-07-01 23:13:28 UTC (rev 2349)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/SearchResults.java	2008-07-01 23:16:20 UTC (rev 2350)
@@ -24,9 +24,8 @@
  */
 package org.archive.wayback.core;
 
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.Properties;
+import java.util.HashMap;
+import java.util.Map;
 
 /**
  *
@@ -36,100 +35,42 @@
 */
 public abstract class SearchResults {
     /**
-     * List of SearchResult objects for index records matching a query
+     * Results: int total number of records matching, not all necc. returned.
     */
-    protected ArrayList<SearchResult> results = null;
+    public static final String RESULTS_NUM_RESULTS = "numresults";
+
     /**
-     * 14-digit timestamp of first capture date contained in the SearchResults
+     * Results: int first record of all matching returned, 1-based
     */
-    protected String firstResultDate;
+    public static final String RESULTS_FIRST_RETURNED = "firstreturned";
+
     /**
-     * 14-digit timestamp of last capture date contained in the SearchResults
+     * Results: int total number of records *returned* in results
     */
-    protected String lastResultDate;
+    public static final String RESULTS_NUM_RETURNED = "numreturned";
+
+    /**
+     * Results: int number of results requested
+     */
+    public static final String RESULTS_REQUESTED = "resultsrequested";
+
     /**
      * Expandable data bag for tuples associated with the search results,
      * likely examples might be "total matching documents", "index of first
      * document returned", etc.
     */
-    private Properties filters = new Properties();
-
+    private HashMap<String,String> filters = null;
     /**
      * Constructor
     */
     public SearchResults() {
-        super();
-        results = new ArrayList<SearchResult>();
+        filters = new HashMap<String,String>();
     }
 
-    /**
-     * @return true if no SearchResult objects, false otherwise.
-     */
-    public boolean isEmpty() {
-        return results.isEmpty();
-    }
+    private long returnedCount = -1;
+    private long firstReturned = -1;
+    private long matchingCount = -1;
+    private long numRequested = -1;
 
     /**
-     * @param result
-     * @param append
-     */
-    public void addSearchResultRaw(final SearchResult result,
-            final boolean append) {
-
-        if(append) {
-            results.add(result);
-        } else {
-            results.add(0,result);
-        }
-    }
-
-    /**
-     * @return one of "Url" or "Capture" depending on the type of results
-     * contained in this object
-     */
-    public abstract String getResultsType();
-
-    /**
-     * append a result
-     * @param result
-     */
-    public abstract void addSearchResult(final SearchResult result);
-    /**
-     * add a result to this results, at either the begginning or at the end,
-     * depending on the append argument
-     * @param result
-     *            SearchResult to add to this set
-     * @param append
-     */
-    public abstract void addSearchResult(final SearchResult result,
-            final boolean append);
-
-    /**
-     * @return number of SearchResult objects contained in these SearchResults
-     */
-    public int getResultCount() {
-        return results.size();
-    }
-
-    /**
-     * @return an Iterator that contains the SearchResult objects
-     */
-    public Iterator<SearchResult> iterator() {
-        return results.iterator();
-    }
-    /**
-     * @return Returns the firstResultDate.
-     */
-    public String getFirstResultDate() {
-        return firstResultDate;
-    }
-    /**
-     * @return Returns the lastResultDate.
-     */
-    public String getLastResultDate() {
-        return lastResultDate;
-    }
-
-    /**
      * @param key
      * @return boolean, true if key 'key' exists in filters
     */
@@ -142,7 +83,7 @@
      * @return value of key 'key' in filters
    */
     public String getFilter(String key) {
-        return filters.getProperty(key);
+        return filters.get(key);
    }
 
     /**
@@ -151,12 +92,66 @@
      * @return previous String value of key 'key' or null if there was none
     */
     public String putFilter(String key, String value) {
-        return (String) filters.setProperty(key, value);
+        return (String) filters.put(key, value);
     }
 
     /**
      * @return Returns the filters.
     */
-    public Properties getFilters() {
+    public Map<String,String> getFilters() {
         return filters;
     }
+    private long getLongFilter(String key) {
+        String tmp = getFilter(key);
+        if(tmp == null) {
+            return 0;
+        }
+        return Long.parseLong(tmp);
+    }
+
+    public long getReturnedCount() {
+        if(returnedCount == -1) {
+            returnedCount = getLongFilter(RESULTS_NUM_RETURNED);
+        }
+        return returnedCount;
+    }
+    public void setReturnedCount(long returnedCount) {
+        this.returnedCount = returnedCount;
+        putFilter(RESULTS_NUM_RETURNED, String.valueOf(returnedCount));
+    }
+
+    public long getFirstReturned() {
+        if(firstReturned == -1) {
+            firstReturned = getLongFilter(RESULTS_FIRST_RETURNED);
+        }
+        return firstReturned;
+    }
+
+    public void setFirstReturned(long firstReturned) {
+        this.firstReturned = firstReturned;
+        putFilter(RESULTS_FIRST_RETURNED, String.valueOf(firstReturned));
+    }
+
+    public long getMatchingCount() {
+        if(matchingCount == -1) {
+            matchingCount = getLongFilter(RESULTS_NUM_RESULTS);
+        }
+        return matchingCount;
+    }
+
+    public void setMatchingCount(long matchingCount) {
+        this.matchingCount = matchingCount;
+        putFilter(RESULTS_NUM_RESULTS, String.valueOf(matchingCount));
+    }
+
+    public long getNumRequested() {
+        if(numRequested == -1) {
+            numRequested = getLongFilter(RESULTS_REQUESTED);
+        }
+        return numRequested;
+    }
+
+    public void setNumRequested(long numRequested) {
+        this.numRequested = numRequested;
+        putFilter(RESULTS_REQUESTED, String.valueOf(numRequested));
    }
 }
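A quick illustration of the string-bag contract the refactored base class exposes, using only methods from the diff above (values invented):

    import java.util.Map;
    import org.archive.wayback.core.SearchResult;

    public class DataBagDemo {
        public static void main(String[] args) {
            SearchResult r = new SearchResult();
            r.put("mimetype", "text/html");
            r.putBoolean("closest", true);       // stores the literal "true"
            r.putBoolean("duplicate", false);    // false removes the key entirely
            System.out.println(r.get("mimetype"));       // text/html
            System.out.println(r.getBoolean("closest"));  // true
            // The whole bag round-trips through a canonical String map:
            Map<String, String> flat = r.toCanonicalStringMap();
            SearchResult copy = new SearchResult();
            copy.fromCanonicalStringMap(flat);
        }
    }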
From: <bra...@us...> - 2008-07-01 23:13:24
Revision: 2349
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2349&view=rev
Author:   bradtofel
Date:     2008-07-01 16:13:28 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
MAJOR REFACTOR: the previously under-specified SearchResult is now explicitly broken into CaptureSearchResult and UrlSearchResult, with handy accessor methods, constant references moved into these classes, and many more edits throughout to achieve it.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UrlSearchResult.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureSearchResult.java	2008-07-01 23:13:28 UTC (rev 2349)
@@ -0,0 +1,234 @@
+/* CaptureSearchResult
+ *
+ * $Id$
+ *
+ * Created on 7:39:24 PM Jun 26, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.core;
+
+import java.util.Date;
+
+import org.archive.wayback.util.url.UrlOperations;
+
+/**
+ *
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class CaptureSearchResult extends SearchResult {
+
+    private long cachedOffset = -1;
+    private long cachedDate = -1;
+
+    public static final String CAPTURE_ORIGINAL_URL = "originalurl";
+
+    /**
+     * Result: canonicalized(lookup key) form of URL of captured document
+     */
+    public static final String CAPTURE_URL_KEY = "urlkey";
+
+    /**
+     * Result: 14-digit timestamp when document was captured
+     */
+    public static final String CAPTURE_CAPTURE_TIMESTAMP = "capturetimestamp";
+
+    /**
+     * Result: basename of ARC file containing this document.
+     */
+    public static final String CAPTURE_FILE = "file";
+
+    /**
+     * Result: compressed byte offset within ARC file where this document's
+     * gzip envelope begins.
+     */
+    public static final String CAPTURE_OFFSET = "compressedoffset";
+
+    /**
+     * Result: compressed byte offset within ARC file where this document's
+     * gzip envelope ends.
+     */
+    public static final String CAPTURE_END_OFFSET = "compressedendoffset";
+
+    /**
+     * Result: best-guess at mime-type of this document.
+     */
+    public static final String CAPTURE_MIME_TYPE = "mimetype";
+
+    /**
+     * Result: 3-digit integer HTTP response code. may be '0' in some
+     * fringe conditions, old ARCs, bug in crawler, etc.
+     */
+    public static final String CAPTURE_HTTP_CODE = "httpresponsecode";
+
+    /**
+     * Result: some form of document fingerprint. This should represent the
+     * HTTP payload only for HTTP captured resources. It may represent an MD5, a
+     * SHA1, and may be a fragment of the full representation of the digest.
+     */
+    public static final String CAPTURE_DIGEST= "digest";
+
+    /**
+     * Result: URL that this document redirected to, or '-' if it does
+     * not redirect
+     */
+    public static final String CAPTURE_REDIRECT_URL = "redirecturl";
+
+    /**
+     * Result: flag within a SearchResult that indicates this is the closest to
+     * a particular requested date.
+     */
+    public static final String CAPTURE_CLOSEST_INDICATOR = "closest";
+    public static final String CAPTURE_CLOSEST_VALUE = "true";
+
+    /**
+     * Result: this key being present indicates that this particular capture
+     * was not actually stored, and that other values within this SearchResult
+     * are actually values from a different record which *should* be identical
+     * to this capture, had it been stored.
+     */
+    public static final String CAPTURE_DUPLICATE_ANNOTATION = "duplicate";
+
+    /**
+     * Result: this key is present when the CAPTURE_DUPLICATE_ANNOTATION is also
+     * present, with the value indicating the last date that was actually
+     * stored for this duplicate.
+     */
+    public static final String CAPTURE_DUPLICATE_STORED_TS = "duplicate-ts";
+
+    /**
+     * flag indicates that this document was downloaded and verified as
+     * identical to a previous capture by digest.
+     */
+    public static final String CAPTURE_DUPLICATE_DIGEST = "digest";
+
+    /**
+     * flag indicates that this document was NOT downloaded, but that the
+     * origin server indicated that the document had not changed, based on
+     * If-Modified HTTP request headers.
+     */
+    public static final String CAPTURE_DUPLICATE_HTTP = "http";
+    public String getOriginalUrl() {
+        return get(CAPTURE_ORIGINAL_URL);
+    }
+    public void setOriginalUrl(String originalUrl) {
+        put(CAPTURE_ORIGINAL_URL,originalUrl);
+    }
+    public String getOriginalHost() {
+        return UrlOperations.urlToHost(getOriginalUrl());
+    }
+    public String getUrlKey() {
+        return get(CAPTURE_URL_KEY);
+    }
+    public void setUrlKey(String urlKey) {
+        put(CAPTURE_URL_KEY,urlKey);
+    }
+    public Date getCaptureDate() {
+        if(cachedDate == -1) {
+            cachedDate = tsToDate(getCaptureTimestamp()).getTime();
+        }
+        return new Date(cachedDate);
+    }
+    public void setCaptureDate(Date date) {
+        cachedDate = date.getTime();
+        put(CAPTURE_CAPTURE_TIMESTAMP, dateToTS(date));
+    }
+    public String getCaptureTimestamp() {
+        return get(CAPTURE_CAPTURE_TIMESTAMP);
+    }
+    public void setCaptureTimestamp(String timestamp) {
+        put(CAPTURE_CAPTURE_TIMESTAMP,timestamp);
+    }
+    public String getFile() {
+        return get(CAPTURE_FILE);
+    }
+    public void setFile(String file) {
+        put(CAPTURE_FILE, file);
+    }
+    public long getOffset() {
+        if(cachedOffset == -1) {
+            cachedOffset = Long.parseLong(get(CAPTURE_OFFSET));
+        }
+        return cachedOffset;
+    }
+    public void setOffset(long offset) {
+        cachedOffset = offset;
+        put(CAPTURE_OFFSET,String.valueOf(offset));
+    }
+    public String getMimeType() {
+        return get(CAPTURE_MIME_TYPE);
+    }
+    public void setMimeType(String mimeType) {
+        put(CAPTURE_MIME_TYPE,mimeType);
+    }
+    public String getHttpCode() {
+        return get(CAPTURE_HTTP_CODE);
+    }
+    public void setHttpCode(String httpCode) {
+        put(CAPTURE_HTTP_CODE,httpCode);
+    }
+    public String getDigest() {
+        return get(CAPTURE_DIGEST);
+    }
+    public void setDigest(String digest) {
+        put(CAPTURE_DIGEST,digest);
+    }
+    public String getRedirectUrl() {
+        return get(CAPTURE_REDIRECT_URL);
+    }
+    public void setRedirectUrl(String url) {
+        put(CAPTURE_REDIRECT_URL,url);
+    }
+    public boolean isClosest() {
+        return getBoolean(CAPTURE_CLOSEST_INDICATOR);
+    }
+    public void setClosest(boolean value) {
+        putBoolean(CAPTURE_CLOSEST_INDICATOR,value);
+    }
+    public void flagDuplicateDigest(Date storedDate) {
+        put(CAPTURE_DUPLICATE_ANNOTATION,CAPTURE_DUPLICATE_DIGEST);
+        put(CAPTURE_DUPLICATE_STORED_TS,dateToTS(storedDate));
+    }
+    public void flagDuplicateDigest(String storedTS) {
+        put(CAPTURE_DUPLICATE_ANNOTATION,CAPTURE_DUPLICATE_DIGEST);
+        put(CAPTURE_DUPLICATE_STORED_TS,storedTS);
+    }
+    public boolean isDuplicateDigest() {
+        String dupeType = get(CAPTURE_DUPLICATE_ANNOTATION);
+        return (dupeType != null && dupeType.equals(CAPTURE_DUPLICATE_DIGEST));
+    }
+    public Date getDuplicateDigestStoredDate() {
+        String dupeType = get(CAPTURE_DUPLICATE_ANNOTATION);
+        Date date = null;
+        if(dupeType != null && dupeType.equals(CAPTURE_DUPLICATE_DIGEST)) {
+            date = tsToDate(get(CAPTURE_DUPLICATE_STORED_TS));
+        }
+        return date;
+    }
+    public String getDuplicateDigestStoredTimestamp() {
+        String dupeType = get(CAPTURE_DUPLICATE_ANNOTATION);
+        String ts = null;
+        if(dupeType != null && dupeType.equals(CAPTURE_DUPLICATE_DIGEST)) {
+            ts = get(CAPTURE_DUPLICATE_STORED_TS);
+        }
+        return ts;
+    }
+}

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UrlSearchResult.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UrlSearchResult.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/UrlSearchResult.java	2008-07-01 23:13:28 UTC (rev 2349)
@@ -0,0 +1,117 @@
+/* UrlSearchResult
+ *
+ * $Id$
+ *
+ * Created on 7:42:06 PM Jun 26, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.wayback.core;
+
+import java.util.Date;
+
+/**
+ *
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class UrlSearchResult extends SearchResult {
+    private long cachedFirst = -1;
+    private long cachedLast = -1;
+    private long cachedNumVersions = -1;
+    private long cachedNumCaptures = -1;
+
+    public final static String URL_KEY = "urlkey";
+    public final static String URL_ORIGINAL_URL = "originalurl";
+    public final static String URL_FIRST_CAPTURE_TIMESTAMP = "firstcapturets";
+    public final static String URL_LAST_CAPTURE_TIMESTAMP = "lastcapturets";
+    public final static String URL_NUM_CAPTURES = "numcaptures";
+    public final static String URL_NUM_VERSIONS = "numversions";
+    public String getUrlKey() {
+        return get(URL_KEY);
+    }
+    public void setUrlKey(String urlKey) {
+        put(URL_KEY,urlKey);
+    }
+    public String getOriginalUrl() {
+        return get(URL_ORIGINAL_URL);
+    }
+    public void setOriginalUrl(String originalUrl) {
+        put(URL_ORIGINAL_URL,originalUrl);
+    }
+    public String getFirstCaptureTimestamp() {
+        return get(URL_FIRST_CAPTURE_TIMESTAMP);
+    }
+    public Date getFirstCaptureDate() {
+        if(cachedFirst == -1) {
+            cachedFirst = tsToDate(getFirstCaptureTimestamp()).getTime();
+        }
+        return new Date(cachedFirst);
+    }
+    public void setFirstCapture(Date date) {
+        cachedFirst = date.getTime();
+        put(URL_FIRST_CAPTURE_TIMESTAMP, dateToTS(date));
+    }
+    public void setFirstCapture(String timestamp) {
+        put(URL_FIRST_CAPTURE_TIMESTAMP, timestamp);
+    }
+    public String getLastCaptureTimestamp() {
+        return get(URL_LAST_CAPTURE_TIMESTAMP);
+    }
+    public Date getLastCaptureDate() {
+        if(cachedLast == -1) {
+            cachedLast = tsToDate(getLastCaptureTimestamp()).getTime();
+        }
+        return new Date(cachedLast);
+    }
+    public void setLastCapture(Date date) {
+        cachedLast = date.getTime();
+        put(URL_LAST_CAPTURE_TIMESTAMP, dateToTS(date));
+    }
+    public void setLastCapture(String timestamp) {
+        put(URL_LAST_CAPTURE_TIMESTAMP, timestamp);
+    }
+    public long getNumCaptures() {
+        if(cachedNumCaptures == -1) {
+            cachedNumCaptures = Long.parseLong(get(URL_NUM_CAPTURES));
+        }
+        return cachedNumCaptures;
+    }
+    public void setNumCaptures(long numCaptures) {
+        cachedNumCaptures = numCaptures;
+        put(URL_NUM_CAPTURES,String.valueOf(numCaptures));
+    }
+    public void setNumCaptures(String numCaptures) {
+        put(URL_NUM_CAPTURES,numCaptures);
+    }
+    public long getNumVersions() {
+        if(cachedNumVersions == -1) {
+            cachedNumVersions = Long.parseLong(get(URL_NUM_VERSIONS));
+        }
+        return cachedNumVersions;
+    }
+    public void setNumVersions(long numVersions) {
+        cachedNumVersions = numVersions;
+        put(URL_NUM_VERSIONS,String.valueOf(numVersions));
+    }
+    public void setNumVersions(String numVersions) {
+        put(URL_NUM_VERSIONS,numVersions);
+    }
+}
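To show what the typed accessors buy over raw string keys, a small hypothetical sketch (all values invented; only setters and getters added in this revision are assumed):

    import java.util.Date;
    import org.archive.wayback.core.CaptureSearchResult;

    public class TypedResultDemo {
        public static void main(String[] args) {
            CaptureSearchResult r = new CaptureSearchResult();
            r.setUrlKey("example.org/");
            r.setOriginalUrl("http://example.org/");
            r.setCaptureTimestamp("20080701000000");
            r.setMimeType("text/html");
            r.setHttpCode("200");
            r.setDigest("sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ");

            // Dates are parsed lazily from the 14-digit timestamp and cached:
            Date when = r.getCaptureDate();
            System.out.println(when);

            // Duplicate-by-digest annotation, as used for de-duplication:
            r.flagDuplicateDigest("20070208173443");
            System.out.println(r.isDuplicateDigest());                 // true
            System.out.println(r.getDuplicateDigestStoredTimestamp()); // 20070208173443
        }
    }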
From: <bi...@us...> - 2008-07-01 23:06:26
Revision: 2348
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2348&view=rev
Author:   binzino
Date:     2008-07-01 16:06:32 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
Added "-u" to sort command. Fixed error in usage info.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx

Modified: trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx	2008-07-01 22:52:08 UTC (rev 2347)
+++ trunk/archive-access/projects/nutchwax/archive/bin/dedup-cdx	2008-07-01 23:06:32 UTC (rev 2348)
@@ -13,14 +13,14 @@
   echo
   echo "Output is in abbreviated form of \"URL digest date\", ex:"
   echo
+  echo "  example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443"
   echo "  example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20080626121505"
-  echo "  example.org sha1:H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ 20070208173443"
   echo
   echo "The output of this script can be used as an exclusions file for"
-  echo "importing (W)ARC files with NutchWAX, and also for adding dates"
+  echo "importing ARC files with NutchWAX, and also for adding dates"
   echo "to a parallel index."
   echo
   exit 1;
 fi
 
-cat $@ | awk '{ print $1 " sha1:" $6 " " $2 }' | sort | awk '{ if ( url == $1 && digest == $2 ) print $1 " " $2 " " $3 ; url = $1 ; digest = $2 }'
+cat $@ | awk '{ print $1 " sha1:" $6 " " $2 }' | sort -u | awk '{ if ( url == $1 && digest == $2 ) print $1 " " $2 " " $3 ; url = $1 ; digest = $2 }'
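Restated, the pipeline reduces each CDX line to "url digest timestamp", sorts it (the new -u collapses fully identical lines so they cannot be reported as duplicates of themselves), then prints every capture after the first for each url+digest pair. A hedged Java re-implementation of the same pass, for readers who do not speak awk (field positions assumed to match the script: URL in $1, timestamp in $2, digest in $6):

    import java.util.TreeSet;

    public class DedupCdxDemo {
        public static void main(String[] args) {
            // Two captures of the same content, plus one distinct capture.
            String[] cdx = {
                "example.org 20070208173443 - - - H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ",
                "example.org 20080626121505 - - - H4NTDLP5DNH6KON63ZALKEV5ELVUDGXJ",
                "example.org/a 20080626121505 - - - OTHERDIGEST",
            };
            TreeSet<String> sorted = new TreeSet<String>();  // plays sort -u
            for (String line : cdx) {
                String[] f = line.split(" +");
                sorted.add(f[0] + " sha1:" + f[5] + " " + f[1]);
            }
            String url = "", digest = "";
            for (String line : sorted) {
                String[] f = line.split(" ");
                // Print all captures after the first for each url+digest pair.
                if (url.equals(f[0]) && digest.equals(f[1])) {
                    System.out.println(f[0] + " " + f[1] + " " + f[2]);
                }
                url = f[0];
                digest = f[1];
            }
            // Prints: example.org sha1:H4NT...DGXJ 20080626121505
        }
    }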
From: <bi...@us...> - 2008-07-01 22:52:05
Revision: 2347
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2347&view=rev
Author:   binzino
Date:     2008-07-01 15:52:08 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
Moved read logic to readBytes() method. Also fixed bug WAX-9, so now up to nutchwax.import.content.limit bytes are read, or all bytes if that property is not defined or has a value of -1.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java

Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-01 22:41:57 UTC (rev 2346)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java	2008-07-01 22:52:08 UTC (rev 2347)
@@ -71,25 +71,24 @@
 /**
- * Convert Archive files (.arc/.warc) files to a Nutch segment. This
- * is sometimes called "importing" other times "converting", the terms
- * are equivalent.
+ * Import Archive (.arc/.warc) files into a newly-created Nutch
+ * segment.
  *
- * <code>Importer</code> is coded as a Hadoop job and is intended
- * to be run within the Hadoop framework, or at least started by the
+ * <code>Importer</code> is coded as a Hadoop job and is intended to
+ * be run within the Hadoop framework, or at least started by the
  * Hadoop launcher incorporated into Nutch. Although there is a
 * <code>main</code> driver, the Nutch launcher script is strongly
 * recommended.
 *
 * This class was initially adapted from the Nutch
- * <code>Fetcher</code> class. The premise is since the Nutch
- * fetching process acquires external content and places it in a Nutch
- * segment, we can perform a similar activity by taking content from
- * the ARC files and place that content in a Nutch segment in a
- * similar fashion. Ideally, once the <code>Importer</code> is
- * used to import a set of ARCs into a Nutch segment, the resulting
- * segment should be more-or-less the same as one created by Nutch's
- * own Fetcher.
+ * <code>Fetcher</code> and <code>ArcSegmentCreator</code> classes.
+ * The premise is since the Nutch fetching process acquires external
+ * content and places it in a Nutch segment, we can perform a similar
+ * activity by taking content from the ARC files and place that
+ * content in a Nutch segment in a similar fashion. Ideally, once the
+ * <code>Importer</code> is used to import a set of ARCs into a Nutch
+ * segment, the resulting segment should be more-or-less the same as
+ * one created by Nutch's own Fetcher.
 *
 * Since we are mimicing the Nutch Fetcher, we have to be careful
 * about some implementation details that might not seem relevant
@@ -241,18 +240,16 @@
       // headers.
       record.skipHttpHeader();
 
-      // TODO: Put in a size limiter, akin to Nutch's file.content.limit.
-
-      // Read the bytes of the HTTP response
-      byte[] bytes = new byte[(int) meta.getLength()];
-
-      // NOTE: Do not use read(byte[]) because ArchiveRecord does NOT over-ride
-      //       the implementation inherited from InputStream. And since it does
-      //       not over-ride it, it won't do the digesting on it. Must use either
-      //       read(byte[],offset,length) or read().
-      record.read( bytes, 0, bytes.length );
+      // We use record.available() rather than meta.getLength()
+      // because the latter includes the size of the HTTP header,
+      // which we just skipped.
+      byte[] bytes = readBytes( record, record.available( ) );
 
-      // Must call close() for digest calculation to be finished.
+      // If there is no digest, then we assume we're reading an
+      // ARCRecord not a WARCRecord. In that case, we close the
+      // record, which updates the digest string. Then we tweak the
+      // digest string so we have the same form for both ARC and WARC
+      // records.
       if ( meta.getDigest() == null )
         {
           record.close();
@@ -505,6 +502,67 @@
   }
 
   /**
+   * Utility method to read the content bytes from an archive record.
+   * The number of bytes read can be limited via the configuration
+   * property <code>nutchwax.import.content.limit</code>.
+   */
+  private byte[] readBytes( ARCRecord record, long contentLength )
+    throws IOException
+  {
+    // Ensure the record does strict reading.
+    record.setStrict( true );
+
+    long size = jobConf.getLong( "nutchwax.import.content.limit", -1 );
+
+    if ( size < 0 )
+      {
+        size = contentLength;
+      }
+    else
+      {
+        size = Math.min( size, contentLength );
+      }
+
+    // Read the bytes of the HTTP response
+    byte[] bytes = new byte[(int) size];
+
+    if ( size == 0 )
+      {
+        return bytes;
+      }
+
+    // NOTE: Do not use read(byte[]) because ArchiveRecord does NOT over-ride
+    //       the implementation inherited from InputStream. And since it does
+    //       not over-ride it, it won't do the digesting on it. Must use either
+    //       read(byte[],offset,length) or read().
+    int pos = 0;
+    while ( (pos += record.read( bytes, pos, (bytes.length - pos) )) < bytes.length )
+      ;
+
+    // Now that the bytes[] buffer has been filled, read the remainder
+    // of the record so that the digest is computed over the entire
+    // content.
+    byte[] buf = new byte[1024 * 1024];
+    int count = 0;
+    while ( record.available( ) > 0 )
+      {
+        count += record.read( buf, 0, Math.min( buf.length, record.available( ) ) );
+      }
+
+    if ( LOG.isInfoEnabled() ) LOG.info( "Bytes read: expected=" + contentLength + " bytes.length=" + bytes.length + " pos=" + pos + " count=" + count );
+
+    // Sanity check. The number of bytes read into our bytes[]
+    // buffer, plus the count of extra stuff read after it should
+    // equal the contentLength passed into this function.
+    if ( pos + count != contentLength )
+      {
+        throw new IOException( "Incorrect number of bytes read from ArchiveRecord: expected=" + contentLength + " bytes.length=" + bytes.length + " pos=" + pos + " count=" + count );
+      }
+
+    return bytes;
+  }
+
+  /**
+   *
    */
   public int run( String[] args ) throws Exception
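The readBytes() pattern (fill a fixed-size buffer, then drain the rest of the record so the running digest still covers every byte) generalizes to any length-limited read. A standalone hedged sketch over a plain InputStream, with ArchiveRecord specifics and the digest itself omitted; unlike the loop above, this one also guards against end-of-stream:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    public class BoundedReadDemo {
        // Read up to 'limit' bytes into the returned buffer, then consume
        // (and discard) the remainder of the stream, as Importer.readBytes()
        // does so the record's digest covers the full content.
        static byte[] readBounded(InputStream in, int limit) throws IOException {
            byte[] bytes = new byte[limit];
            int pos = 0;
            int n;
            while (pos < bytes.length
                    && (n = in.read(bytes, pos, bytes.length - pos)) != -1) {
                pos += n;
            }
            byte[] skip = new byte[8192];
            while (in.read(skip) != -1) {
                // draining only; a digesting stream would keep hashing here
            }
            return bytes;
        }

        public static void main(String[] args) throws IOException {
            InputStream in = new ByteArrayInputStream(new byte[100]);
            System.out.println(readBounded(in, 10).length); // 10
        }
    }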
From: <bi...@us...> - 2008-07-01 22:41:48
Revision: 2346
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2346&view=rev
Author:   binzino
Date:     2008-07-01 15:41:57 -0700 (Tue, 01 Jul 2008)

Log Message:
-----------
Added nutchwax.import.content.limit property. And more comments.

Modified Paths:
--------------
    trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml

Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-06-30 20:38:36 UTC (rev 2345)
+++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml	2008-07-01 22:41:57 UTC (rev 2346)
@@ -13,6 +13,16 @@
   <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax</value>
 </property>
 
+<!-- The indexing filter order *must* be specified in order for
+     NutchWAX's ConfigurableIndexingFilter to be called *after* the
+     BasicIndexingFilter. This is necessary so that the
+     ConfigurableIndexingFilter can over-write some of the values put
+     into the Lucene document by the BasicIndexingFilter.
+
+     The over-written values are the 'url' and 'digest' fields, which
+     NutchWAX needs to handle specially in order for de-duplication to
+     work properly.
+  -->
 <property>
   <name>indexingfilter.order</name>
   <value>
@@ -78,16 +88,38 @@
   <description>Defines if the mime content type detector uses magic resolution.</description>
 </property>
 
+<!-- Normally, this is specified on the command line when the NutchWAX
+     Importer is invoked. It can be specified here if the user
+     prefers.
+  -->
 <property>
   <name>nutchwax.urlfilter.wayback.exclusions</name>
   <value></value>
   <description>Path to file containing list of exclusions.</description>
 </property>
 
+<!-- For CDX-based de-duplication to work properly, you must use the
+     same Wayback URLCanonicalizer that is used by the "(w)arc-indexer"
+     utility. By default, this is AggressiveUrlCanonicalizer, but
+     could be IdentityCanonicalizer if you use the "-i" (identity) option
+     with "(w)arc-indexer".
+  -->
 <property>
   <name>nutchwax.urlfilter.wayback.canonicalizer</name>
   <value>org.archive.wayback.util.url.AggressiveUrlCanonicalizer</value>
   <description></description>
 </property>
 
+<!-- Similar to Nutch's
+       file.content.limit
+       http.content.limit
+       ftp.content.limit
+     properties, this specifies a limit on the size of a document
+     imported via NutchWAX.
+  -->
+<property>
+  <name>nutchwax.import.content.limit</name>
+  <value>1048576</value>
+</property>
+
 </configuration>
Revision: 2345 http://archive-access.svn.sourceforge.net/archive-access/?rev=2345&view=rev Author: binzino Date: 2008-06-30 13:38:36 -0700 (Mon, 30 Jun 2008) Log Message: ----------- Changed logic so that the RangeQuery has a boost of 0.0f and is always required. This is necessary for Nutch to auto-convert the RangeQuery into a RangeFilter. Added class-level JavaDoc. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/DateQueryFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/DateQueryFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/DateQueryFilter.java 2008-06-29 03:07:43 UTC (rev 2344) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/query-nutchwax/src/java/org/archive/nutchwax/query/DateQueryFilter.java 2008-06-30 20:38:36 UTC (rev 2345) @@ -37,7 +37,28 @@ import org.apache.nutch.searcher.QueryFilter; /** - * + * <p> + * Filter on a date or date range. This filter assumes the dates + * are in a field named "date" and adhere to the IA 14-digit date + * format: YYYYMMDDHHMMSS + * </p> + * <p> + * Date values in the query can have less than the full 14-digit + * precision. In that case, they are converted into a range over + * the given precision. For example, "date:2007" is automagically + * converted into "date[20070000000000-20079999999999]". + * </p> + * <p> + * NOTE: In order for this filter to take advantage of the Nutch + * auto-magic conversion of RangeQuery into RangeFilter, we have to + * create the RangeQuery with: + * <ul> + * <li>occur = BooleanClause.Occur.MUST</li> + * <li>boost = 0.0f;</li> + * </ul> + * These are the two conditions that Nutch's LuceneQueryOptimizer + * checks before doing a RangeQuery->RangeFilter conversion. + * </p> */ public class DateQueryFilter implements QueryFilter { @@ -122,17 +143,14 @@ return ; } + // Otherwise make it a plain-old TermQuery to match the exact date. TermQuery term = new TermQuery( new Term( FIELD, date ) ); - // Set boost on term? - // term.setBoolst( boost ); + // Not strictly required since this is a TermQuery and not a + // RangeQuery, but we use the same 0.0f boost for consistency. + term.setBoost( 0.0f ); - output.add( term, - ( clause.isProhibited() - ? BooleanClause.Occur.MUST_NOT - : ( clause.isRequired() - ? BooleanClause.Occur.MUST - : BooleanClause.Occur.SHOULD ) ) ); + output.add( term, BooleanClause.Occur.MUST ); } private void doRangeQuery( BooleanQuery output, Clause clause, String lower, String upper ) @@ -143,16 +161,11 @@ RangeQuery range = new RangeQuery( new Term( FIELD, lower ), new Term( FIELD, upper ), true ); - - // Set boost on range query? - // range.setBoost( boost ); - output.add( range, - ( clause.isProhibited() - ? BooleanClause.Occur.MUST_NOT - : ( clause.isRequired() - ? BooleanClause.Occur.MUST - : BooleanClause.Occur.SHOULD ) ) ); + // Required for LuceneQueryOptimizer to convert to RangeFilter. + range.setBoost( 0.0f ); + + output.add( range, BooleanClause.Occur.MUST ); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
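Boiled down, the pattern the new JavaDoc describes looks like the following sketch against the Lucene 2.x API in use here; the wrapper class, method name, and the concrete bounds (taken from the "date:2007" example in the comment) are ours:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.RangeQuery;

public class DateRangeSketch
{
  static BooleanQuery dateRange()
  {
    BooleanQuery output = new BooleanQuery();

    // The "date:2007" example from the JavaDoc above:
    RangeQuery range = new RangeQuery( new Term( "date", "20070000000000" ),
                                       new Term( "date", "20079999999999" ),
                                       true );  // inclusive endpoints

    range.setBoost( 0.0f );                         // condition 1: zero boost
    output.add( range, BooleanClause.Occur.MUST );  // condition 2: required clause

    // With both conditions met, Nutch's LuceneQueryOptimizer can swap
    // the RangeQuery for a RangeFilter.
    return output;
  }
}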
From: <bi...@us...> - 2008-06-29 03:07:34
|
Revision: 2344 http://archive-access.svn.sourceforge.net/archive-access/?rev=2344&view=rev Author: binzino Date: 2008-06-28 20:07:43 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Fixed typo in usage info. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax Modified: trunk/archive-access/projects/nutchwax/archive/bin/nutchwax =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2008-06-29 00:20:16 UTC (rev 2343) +++ trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2008-06-29 03:07:43 UTC (rev 2344) @@ -55,8 +55,8 @@ echo "Usage: nutchwax COMMAND" echo "where COMMAND is one of:" echo " import Import ARCs into a new Nutch segment" - echo " adddates Add dates to a parallel index" - echo " dumpindex Dump a (parallel) index to the screen" + echo " add-dates Add dates to a parallel index" + echo " dumpindex Dump an index or set of parallel indices to stdout" echo "" exit 1 ;; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-29 00:20:07
|
Revision: 2343 http://archive-access.svn.sourceforge.net/archive-access/?rev=2343&view=rev Author: binzino Date: 2008-06-28 17:20:16 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Changed "archive-digest" to "digest" to match changes in NutchWax code. Added "exclusive" property to ConfigurableIndexingFilter config. Added explicit ordering of index filters so that ours is called last and can over-write metadata values: url, orig, digest. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-29 00:17:48 UTC (rev 2342) +++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-29 00:20:16 UTC (rev 2343) @@ -10,21 +10,32 @@ <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. --> <!-- Also, add 'parse-pdf' --> <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' --> - <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax</value> + <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax</value> </property> <property> + <name>indexingfilter.order</name> + <value> + org.apache.nutch.indexer.basic.BasicIndexingFilter + org.archive.nutchwax.index.ConfigurableIndexingFilter + </value> +</property> + +<property> <!-- Configure the 'index-nutchwax' plugin. Specify how the metadata fields added by the ArcsToSegment are mapped to the Lucene documents during indexing. The specifications here are of the form "src-key:lowercase:store:tokenize:exclusive:dest-key" Where the only required part is the "src-key", the rest will assume the following defaults: lowercase = true store = true tokenize = false exclusive = true dest-key = src-key --> <name>nutchwax.filter.index</name> <value> - archive-digest:false + url:false:true:true + orig:false + digest:false arcname:false collection date @@ -46,7 +57,7 @@ <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter --> <name>nutchwax.filter.query</name> <value> - raw:archive-digest:false + raw:digest:false raw:arcname:false group:collection group:type This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
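To make the spec format concrete, here is how the first entry above decodes under the documented defaults; the annotation is ours:

// url:false:true:true
//
//   src-key   = "url"
//   lowercase = false        (field 2)
//   store     = true         (field 3)
//   tokenize  = true         (field 4)
//   exclusive = true         (omitted, so the default applies)
//   dest-key  = "url"        (omitted, defaults to src-key)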
From: <bi...@us...> - 2008-06-29 00:17:39
|
Revision: 2342 http://archive-access.svn.sourceforge.net/archive-access/?rev=2342&view=rev Author: binzino Date: 2008-06-28 17:17:48 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Changed "key" used to identify document from URL to URL+digest. Also, this value is stored in a metadata field named "orig" in order to work around a bad assumption in Nutch's FetchedSegments.getUrl(). Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-06-28 23:55:28 UTC (rev 2341) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-06-29 00:17:48 UTC (rev 2342) @@ -276,11 +276,38 @@ return false; } + // We create a key which combines the URL and digest values. + // This is necessary because Nutch stores all the data in + // MapFiles, which are basically just {key,value} pairs. + // + // If we use just the URL as the key (which is the way Nutch + // usually works) then we have problems with multiple, + // different copies of the same URL. If we try and store two + // different copies of the same URL (each having a different + // digest) and only use the URL as the key, when the MapFile + // is written, only *one* copy of the page will be stored. + // + // Think about it, we're basically doing: + // MapFile.put( url, value1 ); + // MapFile.put( url, value2 ); + // Only one of those url,value mappings will be kept; the other + // is over-written. + // + // So, by using the url+digest as the key, we can have all the + // data stored. The only problem is all the places in Nutch where + // the key==url is assumed :( + String key = url + " " + meta.getDigest( ); + Metadata contentMetadata = new Metadata(); // Set the segment name, just as is done by standard Nutch fetching. // Then, add the NutchWAX-specific metadata fields. contentMetadata.set( Nutch .SEGMENT_NAME_KEY, segmentName ); + // We store both the normal URL and the URL+digest key for + // later retrieval by the indexing plugin(s). + contentMetadata.set( NutchWax.URL_KEY, url ); + contentMetadata.set( NutchWax.ORIG_KEY, key ); + contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); contentMetadata.set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ); contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); @@ -289,7 +316,7 @@ Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); - output( output, new Text( url ), content ); + output( output, new Text( key ), content ); return true; } @@ -342,7 +369,9 @@ Text key, Content content ) { - // Create the datum + LOG.debug( "output( " + key + " )" ); + + // Create the crawl datum.
CrawlDatum datum = new CrawlDatum( CrawlDatum.STATUS_FETCH_SUCCESS, this.interval, 1.0f ); // ?: I have no idea why we need to store the ProtocolStatus in @@ -418,7 +447,7 @@ { for ( Entry<Text, Parse> entry : parseResult ) { - Text url = entry.getKey(); + Text url = entry.getKey(); Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); @@ -440,10 +469,20 @@ parse.getData().getContentMeta().set( Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature) ); parse.getData().getContentMeta().set( Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime() ) ); + // ?: What is this all about? It was in the original ArcSegmentCreator.java that + // inspired this code. But I can't figure out why we need it. If anything + // this will always be false since our key is now URL+digest, not just URL. + // Since it's always false, let's leave it out. + /* if ( url.equals( key ) ) { datum.setSignature( signature ); } + else + { + if ( LOG.isWarnEnabled() ) LOG.warn( "ParseResult entry key and url differ: key=" + key + " url=" + url ); + } + */ // ?: As above, we'll leave the scoring hooks in place. try @@ -455,7 +494,7 @@ if ( LOG.isWarnEnabled() ) LOG.warn( "Couldn't pass score, url = " + key, e ); } - output.collect( url, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) ); + output.collect( key, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) ); } } } Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-06-28 23:55:28 UTC (rev 2341) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-06-29 00:17:48 UTC (rev 2342) @@ -41,14 +41,15 @@ import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.nutchwax.NutchWax; + + /** * Reads series of (digest+URL,date) lines, finds corresponding * document in index, and adds the date to it. */ public class DateAdder { - - public static void main(String[] args) throws Exception { @@ -117,45 +118,35 @@ Document oldDoc = reader.document( i ); Document newDoc = new Document( ); - // Copy the source values to the new document. - /* - String dates[] = oldDoc.getValues( "date" ); - - if ( dates != null ) - { - for ( String date : dates ) - { - newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); - } - } - */ + // Copy the values from all the source indices to the new + // document.
Set<String> uniqueDates = new HashSet<String>( ); for ( IndexReader source : sourceReaders ) { Document sourceDoc = source.document( i ); - String dates[] = sourceDoc.getValues( "date" ); + String dates[] = sourceDoc.getValues( NutchWax.DATE_KEY ); java.util.Collections.addAll( uniqueDates, dates ); } for ( String date : uniqueDates ) { - newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); + newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); } // First, apply URL canonicalization from Wayback - String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( "url" ) ); + String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( NutchWax.URL_KEY ) ); // Now, get the digest+ URL of the document, look for it in // the updateRecords and if found, add the date. - String key = canonicalizedUrl + oldDoc.get( "archive-digest" ); + String key = canonicalizedUrl + oldDoc.get( NutchWax.DIGEST_KEY ); String newDates = dateRecords.get( key ); if ( newDates != null ) { for ( String date : newDates.split("\\s+") ) { - newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); + newDoc.add( new Field( NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED ) ); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
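The MapFile collision that the Importer comment walks through can be made concrete with a short sketch; the URL and digest values are invented:

// Keyed by URL alone, the second put clobbers the first -- a MapFile
// keeps only one {key,value} pair per key:
//
//   mapfile.put( "http://example.org/page", copy1 );
//   mapfile.put( "http://example.org/page", copy2 );   // copy1 is gone
//
// Keyed by URL + digest (separated by a space, as in Importer above),
// the two captures get distinct keys and both survive:
//
//   "http://example.org/page sha1:AAAA..."   // first capture
//   "http://example.org/page sha1:BBBB..."   // same URL, changed content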
From: <bi...@us...> - 2008-06-28 23:55:19
|
Revision: 2341 http://archive-access.svn.sourceforge.net/archive-access/?rev=2341&view=rev Author: binzino Date: 2008-06-28 16:55:28 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Added keys for more metadata fields. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-06-28 23:41:37 UTC (rev 2340) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWax.java 2008-06-28 23:55:28 UTC (rev 2341) @@ -22,9 +22,11 @@ public class NutchWax { + public static final String URL_KEY = "url"; + public static final String ORIG_KEY = "orig"; public static final String ARCNAME_KEY = "arcname"; public static final String COLLECTION_KEY = "collection"; public static final String CONTENT_TYPE_KEY = "type"; public static final String DATE_KEY = "date"; - public static final String DIGEST_KEY = "archive-digest"; + public static final String DIGEST_KEY = "digest"; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-28 23:41:27
|
Revision: 2340 http://archive-access.svn.sourceforge.net/archive-access/?rev=2340&view=rev Author: binzino Date: 2008-06-28 16:41:37 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Cleaned up a little lingering copy/paste in the extension name. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml 2008-06-28 23:40:31 UTC (rev 2339) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/plugin.xml 2008-06-28 23:41:37 UTC (rev 2340) @@ -35,7 +35,7 @@ <import plugin="nutch-extensionpoints"/> </requires> - <extension id="org.apache.nutch.indexer.basic" + <extension id="org.archive.nutchwax.index" name="Configurable Indexing Filter" point="org.apache.nutch.indexer.IndexingFilter"> <implementation id="ConfigurableIndexingFilter" This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
Revision: 2339 http://archive-access.svn.sourceforge.net/archive-access/?rev=2339&view=rev Author: binzino Date: 2008-06-28 16:40:31 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Added "exclusive" property: when set, existing values are removed from the document before any are added by this filter. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java Modified: trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2008-06-28 23:38:44 UTC (rev 2338) +++ trunk/archive-access/projects/nutchwax/archive/src/plugin/index-nutchwax/src/java/org/archive/nutchwax/index/ConfigurableIndexingFilter.java 2008-06-28 23:40:31 UTC (rev 2339) @@ -69,22 +69,25 @@ boolean lowerCase = true; boolean store = true; boolean tokenize = false; + boolean exclusive = true; String destKey = srcKey; switch ( spec.length ) { + case 6: + destKey = spec[5]; case 5: - destKey = spec[4]; + exclusive = Boolean.parseBoolean( spec[4] ); case 4: - tokenize = Boolean.parseBoolean( spec[3] ); + tokenize = Boolean.parseBoolean( spec[3] ); case 3: - store = Boolean.parseBoolean( spec[2] ); + store = Boolean.parseBoolean( spec[2] ); case 2: lowerCase = Boolean.parseBoolean( spec[1] ); } - LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + tokenize + ":" + destKey ); + LOG.info( "Add field specification: " + srcKey + ":" + lowerCase + ":" + store + ":" + tokenize + ":" + exclusive + ":" + destKey ); - this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, tokenize, destKey ) ); + this.fieldSpecs.add( new FieldSpecification( srcKey, lowerCase, store, tokenize, exclusive, destKey ) ); } } @@ -94,14 +97,16 @@ boolean lowerCase; boolean store; boolean tokenize; + boolean exclusive; String destKey; - public FieldSpecification( String srcKey, boolean lowerCase, boolean store, boolean tokenize, String destKey ) + public FieldSpecification( String srcKey, boolean lowerCase, boolean store, boolean tokenize, boolean exclusive, String destKey ) { this.srcKey = srcKey; this.lowerCase = lowerCase; this.store = store; this.tokenize = tokenize; + this.exclusive = exclusive; this.destKey = destKey; } } @@ -130,6 +135,11 @@ { value = value.toLowerCase( ); } + + if ( spec.exclusive ) + { + doc.removeFields( spec.destKey ); + } doc.add( new Field( spec.destKey, value, This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
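The switch above relies on deliberate case fall-through, so a short spec fills in defaults for everything it omits. A self-contained sketch of the same trick; the class name and the example spec "digest:false" are ours:

public class SpecParseSketch
{
  public static void main( String[] args )
  {
    // An invented spec supplying only the first two fields:
    String[] spec = "digest:false".split( ":" );

    // Defaults, overridden only by the fields the spec actually supplies.
    boolean lowerCase = true, store = true, tokenize = false, exclusive = true;
    String destKey = spec[0];

    // No breaks: deliberate fall-through, exactly as in the patch above.
    switch ( spec.length )
    {
      case 6: destKey   = spec[5];
      case 5: exclusive = Boolean.parseBoolean( spec[4] );
      case 4: tokenize  = Boolean.parseBoolean( spec[3] );
      case 3: store     = Boolean.parseBoolean( spec[2] );
      case 2: lowerCase = Boolean.parseBoolean( spec[1] );
    }

    // Prints: digest:false:true:false:true:digest
    System.out.println( spec[0] + ":" + lowerCase + ":" + store + ":" + tokenize + ":" + exclusive + ":" + destKey );
  }
}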
From: <bi...@us...> - 2008-06-28 23:38:34
|
Revision: 2338 http://archive-access.svn.sourceforge.net/archive-access/?rev=2338&view=rev Author: binzino Date: 2008-06-28 16:38:44 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Changed the name of the Java class called for importing, matching the change in the Java class name: ArchiveImporter -> Importer. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax Modified: trunk/archive-access/projects/nutchwax/archive/bin/nutchwax =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2008-06-28 23:37:51 UTC (rev 2337) +++ trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2008-06-28 23:38:44 UTC (rev 2338) @@ -40,7 +40,7 @@ case "$1" in import) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.ArchiveImporter $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.Importer $@ ;; add-dates) shift This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-28 23:37:42
|
Revision: 2337 http://archive-access.svn.sourceforge.net/archive-access/?rev=2337&view=rev Author: binzino Date: 2008-06-28 16:37:51 -0700 (Sat, 28 Jun 2008) Log Message: ----------- Added some additional output to a command-line search. Nice for testing and debugging. Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java Modified: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java 2008-06-27 00:31:59 UTC (rev 2336) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java 2008-06-28 23:37:51 UTC (rev 2337) @@ -222,13 +222,15 @@ System.out.println( " " + i + " " - + details[i].getValue( "segment" ) + + java.util.Arrays.asList( details[i].getValues( "segment" ) ) + " " - + details[i].getValue( "url") + + java.util.Arrays.asList( details[i].getValues( "url" ) ) + " " - + details[i].getValue( "archive-digest") + + java.util.Arrays.asList( details[i].getValues( "orig" ) ) + " " - + java.util.Arrays.asList( details[i].getValues( "date") ) + + java.util.Arrays.asList( details[i].getValues( "digest" ) ) + + " " + + java.util.Arrays.asList( details[i].getValues( "date" ) ) + "\n" + summaries[i] ); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
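The point of the change: getValue() yields a single value while getValues() yields them all, which matters for multi-valued fields such as "date". A sketch of the difference, with invented values:

// Given a hit whose document carries two "date" values:
//
//   details.getValue( "date" )    ->  "20070304000000"            (one value)
//   details.getValues( "date" )   ->  { "20070304000000",         (all values)
//                                       "20080101000000" }
//
// Wrapping the array in java.util.Arrays.asList(), as the patch does,
// just gets a readable [a, b] rendering out of println().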
From: <bi...@us...> - 2008-06-27 00:31:49
|
Revision: 2336 http://archive-access.svn.sourceforge.net/archive-access/?rev=2336&view=rev Author: binzino Date: 2008-06-26 17:31:59 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Initial revision. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/NutchWaxBean.java 2008-06-27 00:31:59 UTC (rev 2336) @@ -0,0 +1,238 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.nutchwax; + +import java.io.*; +import java.util.*; +import java.lang.reflect.Field; +import javax.servlet.*; + +import org.apache.nutch.searcher.NutchBean; +import org.apache.nutch.searcher.IndexSearcher; +import org.apache.nutch.searcher.Query; +import org.apache.nutch.searcher.HitDetails; +import org.apache.nutch.searcher.Hit; +import org.apache.nutch.searcher.Hits; +import org.apache.nutch.searcher.Summary; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Closeable; +import org.apache.hadoop.conf.Configuration; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.ArchiveParallelReader; +import org.apache.lucene.index.MultiReader; + +import org.apache.nutch.util.HadoopFSUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.indexer.FsDirectory; + +/** + * Utility class to use and extend the NutchBean class for reading + * from parallel indices. + * + * This can be used from the command-line to run test/debug searches, + * the same as NutchBean, but using parallel indices. + * + * NutchWaxBean doesn't extend NutchBean directly since all the good + * stuff inside of NutchBean is declared private. So, we dynamically + * modify a NutchBean instance via reflection to inject our own + * IndexReader that reads from a set of parallel indices. + * + * Before you recoil in horror over this approach, the alternatives + * were none too pretty. Sub-classing won't work since all the + * NutchBean data members are declared private. We could copy the + * NutchBean.java into our own source base and effectively over-write + * the Nutch version when we compile, but that is a maintenance + * headache of extreme magnitude. Plus, we'd probably have to + * copy/paste/edit multiple Java source files.
+ * + * Ideally, Nutch would use some sort of dependency injection system, + * or at least make the NutchBean data members have public get/set + * methods (like a bean should). For now, doing dynamic injection via + * reflection seemed the least obtrusive. + */ +public class NutchWaxBean +{ + + /** + * Static utility class for modifying a NutchBean instance. + */ + public static class NutchBeanModifier + { + /** + * Modify the NutchBean by replacing the IndexReader in its + * IndexSearcher with one we create that uses + * ArchiveParallelReader for searching across parallel indices. + */ + public static void modify( NutchBean bean ) + { + try + { + // First, get the configuration from the bean. Gosh it would be + // nice if NutchBean had a getConf() public method, wouldn't it? + Field fConf = NutchBean.class.getDeclaredField( "conf" ); + fConf.setAccessible( true ); + + // The rest of this code is similar to NutchBean in that it + // looks for a 'pindexes' directory as a sibling of the + // 'indexes' directory that NutchBean finds. + Configuration conf = (Configuration) fConf.get( bean ); + + FileSystem fs = FileSystem.get( conf ); + + Path dir = new Path( conf.get( "searcher.dir", "crawl") ); + + Path indexesDir = new Path( dir, "pindexes" ); + + Path indexDirs[] = fs.listPaths(indexesDir, HadoopFSUtil.getPassDirectoriesFilter(fs)); + + List<IndexReader> readers = new ArrayList<IndexReader>( indexDirs.length ); + + for ( Path indexDir : indexDirs ) + { + Path parallelDirs[] = fs.listPaths( indexDir, HadoopFSUtil.getPassDirectoriesFilter(fs) ); + + if ( parallelDirs.length < 1 ) + { + continue; + } + + ArchiveParallelReader reader = new ArchiveParallelReader( ); + + // Sort the parallelDirs so that we add them in order. Order + // matters to the ParallelReader. + Arrays.sort( parallelDirs ); + + for ( Path p : parallelDirs ) + { + reader.add( IndexReader.open( new FsDirectory( fs, p, false, conf ) ) ); + } + + readers.add( reader ); + } + + MultiReader reader = new MultiReader( readers.toArray( new IndexReader[0] ) ); + + // Now, inject the 'reader' into the NutchBean's IndexSearcher via reflection. + Field fSearcher = NutchBean.class.getDeclaredField( "searcher" ); + Field fReader = IndexSearcher.class.getDeclaredField( "reader" ); + Field fLuceneSearcher = IndexSearcher.class.getDeclaredField( "luceneSearcher" ); + + fSearcher .setAccessible( true ); + fReader .setAccessible( true ); + fLuceneSearcher.setAccessible( true ); + + org.apache.lucene.search.IndexSearcher newLuceneSearcher = new org.apache.lucene.search.IndexSearcher( reader ); + + IndexSearcher searcher = (IndexSearcher) fSearcher.get( bean ); + fLuceneSearcher.set( searcher, newLuceneSearcher ); + fReader .set( searcher, reader ); + } + catch ( Exception e ) + { + throw new RuntimeException( e ); + } + } + } + + /** + * Similar to code in NutchBean. This receives the events from the + * servlet container and modifies the NutchBean instance put there + * by the NutchBeanConstructor listener. For this to work, it must + * be declared after the NutchBeanConstructor in the web.xml file, + * e.g. 
+ * <pre> + * <listener> + * <listener-class>org.apache.nutch.searcher.NutchBean$NutchBeanConstructor</listener-class> + * <listener-class>org.archive.nutchwax.NutchWaxBean$NutchWaxBeanConstructor</listener-class> + * </listener> + * </pre> + */ + public static class NutchWaxBeanConstructor implements ServletContextListener + { + + public void contextDestroyed( ServletContextEvent sce ) + { + } + + public void contextInitialized( ServletContextEvent sce ) + { + ServletContext app = sce.getServletContext(); + NutchBean bean = (NutchBean) app.getAttribute( NutchBean.KEY ); + + if ( bean == null ) + { + NutchBean.LOG.fatal( "No value for \"" + NutchBean.KEY + "\" in servlet context" ); + + return ; + } + + // Modify the NutchBean. + NutchBeanModifier.modify( bean ); + } + + } + + /** + * Simple command-line driver akin to NutchBean.main that performs + * the bean modification. Useful for testing and debugging from the + * command-line. + */ + public static void main(String[] args) throws Exception + { + String usage = "NutchWaxBean query"; + + if (args.length == 0) + { + System.err.println(usage); + System.exit(-1); + } + + Configuration conf = NutchConfiguration.create(); + + NutchBean bean = new NutchBean(conf); + NutchBeanModifier.modify( bean ); + + Query query = Query.parse(args[0], conf); + Hits hits = bean.search(query, 10); + System.out.println("Total hits: " + hits.getTotal()); + int length = (int)Math.min(hits.getTotal(), 10); + Hit[] show = hits.getHits(0, length); + HitDetails[] details = bean.getDetails(show); + Summary[] summaries = bean.getSummary(details, query); + + for (int i = 0; i < hits.getLength(); i++) + { + // Use a slightly more verbose output than NutchBean. + System.out.println( " " + + i + + " " + + details[i].getValue( "segment" ) + + " " + + details[i].getValue( "url") + + " " + + details[i].getValue( "archive-digest") + + " " + + java.util.Arrays.asList( details[i].getValues( "date") ) + + "\n" + + summaries[i] ); + } + } + + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
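The reflection maneuver the JavaDoc defends comes down to getDeclaredField(), setAccessible(), and set(). A self-contained sketch against a stand-in class; Bean, its field, and the values are ours, not NutchBean's actual internals:

import java.lang.reflect.Field;

class Bean
{
  private String conf = "original";   // private, no setter -- like NutchBean's members
}

public class InjectionSketch
{
  public static void main( String[] args ) throws Exception
  {
    Bean bean = new Bean();

    Field f = Bean.class.getDeclaredField( "conf" );
    f.setAccessible( true );          // lift the 'private' restriction
    f.set( bean, "replacement" );     // inject our own value

    System.out.println( f.get( bean ) );  // prints "replacement"
  }
}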
From: <bi...@us...> - 2008-06-26 22:39:35
|
Revision: 2335 http://archive-access.svn.sourceforge.net/archive-access/?rev=2335&view=rev Author: binzino Date: 2008-06-26 15:39:42 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Add handling of DateAdder and name change of ArcsToSegment to Importer Modified Paths: -------------- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax Modified: trunk/archive-access/projects/nutchwax/archive/bin/nutchwax =================================================================== --- trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2008-06-26 22:38:49 UTC (rev 2334) +++ trunk/archive-access/projects/nutchwax/archive/bin/nutchwax 2008-06-26 22:39:42 UTC (rev 2335) @@ -40,28 +40,23 @@ case "$1" in import) shift - if [ $# -eq 0 ]; then - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.ArcsToSegment - exit 1 - fi - if [ -z "$2" ]; then - segment=`date +"%Y%m%d%H%M%S"` - segment="segments/${segment}" - else - segment="$2" - fi - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.ArcsToSegment "$1" "${segment}" + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.ArchiveImporter $@ ;; + add-dates) + shift + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DateAdder $@ + ;; dumpindex) shift - ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DumpIndex $@ + ${NUTCH_HOME}/bin/nutch org.archive.nutchwax.tools.DumpParallelIndex $@ ;; *) echo "" echo "Usage: nutchwax COMMAND" echo "where COMMAND is one of:" echo " import Import ARCs into a new Nutch segment" - echo " dumpindex Dump an index to the screen" + echo " adddates Add dates to a parallel index" + echo " dumpindex Dump a (parallel) index to the screen" echo "" exit 1 ;; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-26 22:38:43
|
Revision: 2334 http://archive-access.svn.sourceforge.net/archive-access/?rev=2334&view=rev Author: binzino Date: 2008-06-26 15:38:49 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Renamed class for clarity of purpose/use. Add handling of digest to enable filtering of duplicates in conjunction with WaybackURLFilter. Moved command-line handling of segments from 'nutchwax' script to here for consistency with other Nutch actions. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java Removed Paths: ------------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcsToSegment.java Deleted: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcsToSegment.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcsToSegment.java 2008-06-26 22:36:45 UTC (rev 2333) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcsToSegment.java 2008-06-26 22:38:49 UTC (rev 2334) @@ -1,553 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.archive.nutchwax; - -import java.io.IOException; -import java.net.MalformedURLException; -import java.util.Map.Entry; -import java.util.Iterator; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configured; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.io.WritableComparable; -import org.apache.hadoop.mapred.JobClient; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.Mapper; -import org.apache.hadoop.mapred.OutputCollector; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.hadoop.mapred.TextOutputFormat; -import org.apache.hadoop.util.StringUtils; -import org.apache.hadoop.util.Tool; -import org.apache.hadoop.util.ToolRunner; -import org.apache.nutch.crawl.CrawlDatum; -import org.apache.nutch.crawl.NutchWritable; -import org.apache.nutch.crawl.SignatureFactory; -import org.apache.nutch.fetcher.FetcherOutputFormat; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.metadata.Nutch; -import org.apache.nutch.net.URLFilters; -import org.apache.nutch.net.URLFilterException; -import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.parse.Parse; -import org.apache.nutch.parse.ParseImpl; -import org.apache.nutch.parse.ParseResult; -import org.apache.nutch.parse.ParseStatus; -import org.apache.nutch.parse.ParseText; -import org.apache.nutch.parse.ParseUtil; -import org.apache.nutch.protocol.Content; -import org.apache.nutch.protocol.ProtocolStatus; -import org.apache.nutch.scoring.ScoringFilters; -import org.apache.nutch.util.LogUtil; -import org.apache.nutch.util.NutchConfiguration; -import org.apache.nutch.util.NutchJob; -import org.apache.nutch.util.StringUtil; - -import org.archive.io.ArchiveReader; -import org.archive.io.ArchiveReaderFactory; -import org.archive.io.arc.ARCRecord; -import org.archive.io.arc.ARCRecordMetaData; - - -/** - * Convert Archive files (.arc/.warc) files to a Nutch segment. This - * is sometimes called "importing" other times "converting", the terms - * are equivalent. - * - * <code>ArcsToSegment</code> is coded as a Hadoop job and is intended - * to be run within the Hadoop framework, or at least started by the - * Hadoop launcher incorporated into Nutch. Although there is a - * <code>main</code> driver, the Nutch launcher script is strongly - * recommended. - * - * This class was initially adapted from the Nutch - * <code>Fetcher</code> class. The premise is since the Nutch - * fetching process acquires external content and places it in a Nutch - * segment, we can perform a similar activity by taking content from - * the ARC files and place that content in a Nutch segment in a - * similar fashion. Ideally, once the <code>ArcsToSegment</code> is - * used to import a set of ARCs into a Nutch segment, the resulting - * segment should be more-or-less the same as one created by Nutch's - * own Fetcher. - * - * Since we are mimicing the Nutch Fetcher, we have to be careful - * about some implementation details that might not seem relevant - * to the importing of ARC files. I've noted those details with - * comments prefaced with "?:". 
- */ -public class ArcsToSegment extends Configured implements Tool, Mapper -{ - - public static final Log LOG = LogFactory.getLog( ArcsToSegment.class ); - - private JobConf jobConf; - private URLFilters urlFilters; - private ScoringFilters scfilters; - private ParseUtil parseUtil; - private URLNormalizers normalizers; - private int interval; - - private long numSkipped; - private long numImported; - private long bytesSkipped; - private long bytesImported; - - /** - * ?: Is this necessary? - */ - public ArcsToSegment() - { - - } - - /** - * <p>Constructor that sets the job configuration.</p> - * - * @param conf - */ - public ArcsToSegment( Configuration conf ) - { - setConf( conf ); - } - - /** - * <p>Configures the job. Sets the url filters, scoring filters, url normalizers - * and other relevant data.</p> - * - * @param job The job configuration. - */ - public void configure( JobConf job ) - { - // set the url filters, scoring filters the parse util and the url - // normalizers - this.jobConf = job; - this.urlFilters = new URLFilters ( jobConf ); - this.scfilters = new ScoringFilters( jobConf ); - this.parseUtil = new ParseUtil ( jobConf ); - this.normalizers = new URLNormalizers( jobConf, URLNormalizers.SCOPE_FETCHER ); - this.interval = jobConf.getInt( "db.fetch.interval.default", 2592000 ); - } - - /** - * In Mapper interface. - * @inherit - */ - public void close() - { - - } - - /** - * <p>Runs the Map job to translate an arc file into output for Nutch - * segments.</p> - * - * @param key Line number in manifest corresponding to the <code>value</code> - * @param value A line from the manifest - * @param output The output collecter. - * @param reporter The progress reporter. - */ - public void map( WritableComparable key, - Writable value, - OutputCollector output, - Reporter reporter ) - throws IOException - { - String arcUrl = ""; - String collection = ""; - String segmentName = getConf().get( Nutch.SEGMENT_NAME_KEY ); - - // Each line of the manifest is "<url> <collection>" where <collection> is optional - String[] line = value.toString().split( " " ); - arcUrl = line[0]; - - if ( line.length > 1 ) - { - collection = line[1]; - } - - if ( LOG.isInfoEnabled() ) LOG.info( "Importing ARC: " + arcUrl ); - - ArchiveReader r = ArchiveReaderFactory.get( arcUrl ); - - ArcReader reader = new ArcReader( r ); - - try - { - for ( ARCRecord record : reader ) - { - // When reading WARC files, records of type other than - // "response" are returned as 'null' by the Iterator, so - // we skip them. - if ( record == null ) continue ; - - importRecord( record, segmentName, collection, output ); - - // FIXME: What does this do exactly? - reporter.progress(); - } - } - finally - { - r.close(); - - if ( LOG.isInfoEnabled() ) - { - LOG.info( "Completed ARC: " + arcUrl ); - LOG.info( "URLs skipped : " + this.numSkipped ); - LOG.info( "URLs imported: " + this.numImported ); - LOG.info( "URLs total : " + ( this.numSkipped + this.numImported ) ); - } - } - - } - - /** - * Import an ARCRecord. - * - * @param record - * @param segmentName - * @param collectionName - * @param output - * @return whether record was imported or not (i.e. filtered out due to URL filtering rules, etc.) - */ - private boolean importRecord( ARCRecord record, String segmentName, String collectionName, OutputCollector output ) - { - ARCRecordMetaData meta = record.getMetaData(); - - if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ")" ); - - /* ?: On second thought, DON'T do this. 
Even if we don't have a - parser registered for a content-type, we still want to index - its URL and possibly other meta-data. - */ - /* - // First, check to see if we have a parser registered for the - // URL's Content-Type, so we don't read in some huge video file - // only to discover we don't have a parser for it. - if ( ! this.hasRegisteredParser( meta.getMimetype() ) ) - { - if ( LOG.isInfoEnabled() ) LOG.info( "No parser registered for: " + meta.getMimetype() ); - - this.numSkipped++; - this.bytesSkipped += meta.getLength(); - - return false ; - } - */ - - // ?: Arguably, we shouldn't be normalizing nor filtering based - // on the URL. If the document made it into the (W)ARC file, then - // it should be indexed. But then again, the normalizers and - // filters can be disabled in the Nutch configuration files. - String url = this.normalizeAndFilterUrl( meta.getUrl() ); - - if ( url == null ) - { - if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() ); - - this.numSkipped++; - this.bytesSkipped += meta.getLength(); - - return false; - } - - // URL is good, let's import the content. - if ( LOG.isInfoEnabled() ) LOG.info( "Import URL: " + meta.getUrl() ); - this.numImported++; - this.bytesImported += meta.getLength(); - - try - { - // Skip the HTTP headers in the response body, so that the - // parsers are parsing the reponse body and not the HTTP - // headers. - record.skipHttpHeader(); - - // Read the bytes of the HTTP response - byte[] bytes = new byte[(int) meta.getLength()]; - record.read( bytes ); - - Metadata contentMetadata = new Metadata( ); - // Set the segment name, just as is done by standard Nutch fetching. - // Then, add the NutchWAX-specific metadata fields. - contentMetadata.set( Nutch .SEGMENT_NAME_KEY, segmentName ); - - contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); - contentMetadata.set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ); - contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); - contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); - - Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); - - output( output, new Text( url ), content ); - - return true; - } - catch ( Throwable t ) - { - LOG.error( "Import fail : " + meta.getUrl( ), t ); - } - - return false; - } - - /** - * Normalize and filter ther URL. If the URL is malformed or - * filtered (according to registered Nutch URL filtering plugins), - * return <code>null</code>. Otherwise return the normalized URL. - * - * @param candidateUrl to be normalized and filtered - * @return normalized URL, <code>null</code> if malformed or filtered out - */ - private String normalizeAndFilterUrl( String candidateUrl ) - { - String url = null; - try - { - url = normalizers.normalize( candidateUrl, URLNormalizers.SCOPE_FETCHER ); - - url = urlFilters.filter( url ); - - return url; - } - catch ( MalformedURLException mue ) - { - if ( LOG.isInfoEnabled() ) LOG.info( "MalformedURL: " + candidateUrl ); - } - catch ( URLFilterException ufe ) - { - if ( LOG.isInfoEnabled() ) LOG.info( "URL filtered: " + candidateUrl ); - } - - return null; - } - - /** - * TODO: Add check for registered parser for URL's Content-Type. - * The idea is to see if there is a registered parser *before* - * reading all the bytes of the content. This way, if we have a - * 100MB .mp4 movie file, but no parser registered for it, we don't - * bother reading in the 100MB body. 
- * - * Right now, the ParseUtil doesn't have a hasParser(ContentType) - * method, so we have to read in the entire content body then try to - * parse it just to discover if it is parsable or not. - * - * Another option is to create a fake Content object with the same - * Content-Type as the real content and then try parsing the fake - * Cotnent object to see if a parser was found for it or not. But - * that seems pretty hokey. - */ - private boolean hasRegisteredParser( String contentType ) - { - /* The following would be nice if such a method existed... - - return this.parseUtil.hasParser( contentType ); - */ - return true; - } - - /** - * - */ - private void output( OutputCollector output, - Text key, - Content content ) - { - // Create the datum - CrawlDatum datum = new CrawlDatum( CrawlDatum.STATUS_FETCH_SUCCESS, this.interval, 1.0f ); - - // ?: I have no idea why we need to store the ProtocolStatus in - // the datum's metadata, but the Nutch Fetcher class does it and - // it seems important. Since we're not really fetching here, we - // assume ProtocolStatus.STATUS_SUCCESS is the right thing to do. - datum.getMetaData().put( Nutch.WRITABLE_PROTO_STATUS_KEY, ProtocolStatus.STATUS_SUCCESS ); - - // ?: Since we store the ARCRecord's archival date in the Content object, we follow the - // logic in Nutch's Fetcher and store the current import time/date in the Datum. I have - // no idea if it makes a difference, other than this value is stored in the "tstamp" - // field in the Lucene index whereas the ARCRecord date is stored in the "date" field - // we added above. - datum.setFetchTime( System.currentTimeMillis() ); - - // ?: It doesn't seem to me that we need/use the scoring stuff - // one way or another, but we might as well leave it in. - try - { - scfilters.passScoreBeforeParsing( key, datum, content ); - } - catch ( Exception e ) - { - if ( LOG.isWarnEnabled() ) LOG.warn( "Couldn't pass score before parsing for: " + key, e ); - } - - // ?: This is kind of interesting. In the Nutch Fetcher class, if the parsing fails, - // the Content is not added to the output. But in ArcsToSegment, we still add it, even - // if the parsing fails. Why? - // - // One benefit is that even if the parsing fails, having the Content in the index still - // allows us to find the document by URL, date, etc. - // - // However, I don't know what will happen when a summary is computed...if the Content isn't there, will - // it fail or just return an empty summary? - ParseResult parseResult = null; - try - { - parseResult = this.parseUtil.parse( content ); - } - catch ( Exception e ) - { - LOG.warn( "Error parsing: " + key, e ); - } - - // ?: This is taken from Nutch Fetcher. I believe the signatures are used in the Fetcher - // to ensure that URL contents are not stored multiple times if the signature doesn't change. - // Makes sense. But, in our case, we're relying on the (W)ARC production tools to eliminate - // duplicate data (or are we?), so how important is the signature for our purposes? - // I'll go ahead and leave it in, in case it's needed by Nutch for unknown purposes. - // - // Also, since we still import documents even if the parsing fails, we compute a signature - // using an "empty" Parse object in the case of parse failure. I don't know why we create - // an empty Parse object rather than just use 'null', but I'm copying the way the Fetcher - // does it. 
- // - // One odd thing is that we add the signature to the datum here, then "collect" the datum - // just below, but then after collecting the datum, we update the signature when processing - // the ParseResults. I guess "collecting" doesn't write out the datum, but "collects" it for - // later output, thus we can update it after collection (I guess). - if ( parseResult == null ) - { - byte[] signature = SignatureFactory.getSignature( getConf() ).calculate( content, new ParseStatus().getEmptyParse( getConf() ) ); - datum.setSignature( signature ); - } - - try - { - output.collect( key, new NutchWritable( datum ) ); - output.collect( key, new NutchWritable( content ) ); - - if ( parseResult != null ) - { - for ( Entry<Text, Parse> entry : parseResult ) - { - Text url = entry.getKey(); - Parse parse = entry.getValue(); - ParseStatus parseStatus = parse.getData().getStatus(); - - if ( !parseStatus.isSuccess() ) - { - LOG.warn( "Error parsing: " + key + ": " + parseStatus ); - parse = parseStatus.getEmptyParse(getConf()); - } - - byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse); - - // ?: Why bother setting this one again? According to ParseData Javadoc, - // the getContentMeta() returns the original Content metadata object, so - // why are we setting the segment name on it to the same value again? - // Let's leave it out. - // parse.getData().getContentMeta().set( Nutch.SEGMENT_NAME_KEY, segmentName ); - - // ?: These two are copied from Nutch's Fetcher implementation. - parse.getData().getContentMeta().set( Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature) ); - parse.getData().getContentMeta().set( Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime() ) ); - - if ( url.equals( key ) ) - { - datum.setSignature( signature ); - } - - // ?: As above, we'll leave the scoring hooks in place. 
- try - { - scfilters.passScoreAfterParsing( url, content, parse ); - } - catch ( Exception e ) - { - if ( LOG.isWarnEnabled() ) LOG.warn( "Couldn't pass score, url = " + key, e ); - } - - output.collect( url, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) ); - } - } - } - catch ( Exception e ) - { - LOG.error( "Error outputting Nutch record for: " + key, e ); - } - } - - /** - * - */ - public int run( String[] args ) throws Exception - { - String usage = "Usage: ArcsToSegment <manifestPath> <segmentPath>"; - - if ( args.length < 2 ) - { - System.err.println( usage ); - return -1; - } - - Path manifestPath = new Path( args[0] ); - Path segmentPath = new Path( args[1] ); - - JobConf job = new NutchJob( getConf() ); - - try - { - job.setJobName( "ArcsToSegment " + manifestPath ); - job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() ); - - job.setInputPath ( manifestPath); - job.setInputFormat( TextInputFormat.class ); - - job.setMapperClass( ArcsToSegment.class ); - - job.setOutputPath ( segmentPath ); - job.setOutputFormat ( FetcherOutputFormat.class ); - job.setOutputKeyClass ( Text.class ); - job.setOutputValueClass( NutchWritable.class ); - - JobClient.runJob( job ); - } - catch ( Exception e ) - { - LOG.fatal( "ArcsToSegment: ", e ); - return -1; - } - - return 0; - } - - /** - * - */ - public static void main(String args[]) throws Exception - { - int result = ToolRunner.run( NutchConfiguration.create(), new ArcsToSegment(), args ); - - System.exit( result ); - } - -} Copied: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java (from rev 2289, trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/ArcsToSegment.java) =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/Importer.java 2008-06-26 22:38:49 UTC (rev 2334) @@ -0,0 +1,570 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.nutchwax; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.util.Map.Entry; +import java.util.Iterator; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.io.WritableComparable; +import org.apache.hadoop.mapred.JobClient; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.Mapper; +import org.apache.hadoop.mapred.OutputCollector; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.hadoop.mapred.TextOutputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.NutchWritable; +import org.apache.nutch.crawl.SignatureFactory; +import org.apache.nutch.fetcher.FetcherOutputFormat; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.net.URLFilters; +import org.apache.nutch.net.URLFilterException; +import org.apache.nutch.net.URLNormalizers; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseResult; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.parse.ParseText; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.ProtocolStatus; +import org.apache.nutch.scoring.ScoringFilters; +import org.apache.nutch.util.LogUtil; +import org.apache.nutch.util.NutchConfiguration; +import org.apache.nutch.util.NutchJob; +import org.apache.nutch.util.StringUtil; + +import org.archive.io.ArchiveReader; +import org.archive.io.ArchiveReaderFactory; +import org.archive.io.ArchiveRecordHeader; +import org.archive.io.arc.ARCRecord; +import org.archive.io.arc.ARCRecordMetaData; +import org.archive.io.warc.WARCConstants; + + +/** + * Convert Archive files (.arc/.warc) files to a Nutch segment. This + * is sometimes called "importing" other times "converting", the terms + * are equivalent. + * + * <code>Importer</code> is coded as a Hadoop job and is intended + * to be run within the Hadoop framework, or at least started by the + * Hadoop launcher incorporated into Nutch. Although there is a + * <code>main</code> driver, the Nutch launcher script is strongly + * recommended. + * + * This class was initially adapted from the Nutch + * <code>Fetcher</code> class. The premise is since the Nutch + * fetching process acquires external content and places it in a Nutch + * segment, we can perform a similar activity by taking content from + * the ARC files and place that content in a Nutch segment in a + * similar fashion. Ideally, once the <code>Importer</code> is + * used to import a set of ARCs into a Nutch segment, the resulting + * segment should be more-or-less the same as one created by Nutch's + * own Fetcher. + * + * Since we are mimicing the Nutch Fetcher, we have to be careful + * about some implementation details that might not seem relevant + * to the importing of ARC files. I've noted those details with + * comments prefaced with "?:". 
+ */
+public class Importer extends Configured implements Tool, Mapper
+{
+
+  public static final Log LOG = LogFactory.getLog( Importer.class );
+
+  private JobConf jobConf;
+  private URLFilters urlFilters;
+  private ScoringFilters scfilters;
+  private ParseUtil parseUtil;
+  private URLNormalizers normalizers;
+  private int interval;
+
+  private long numSkipped;
+  private long numImported;
+  private long bytesSkipped;
+  private long bytesImported;
+
+  /**
+   * ?: Is this necessary?
+   */
+  public Importer()
+  {
+
+  }
+
+  /**
+   * <p>Constructor that sets the job configuration.</p>
+   *
+   * @param conf
+   */
+  public Importer( Configuration conf )
+  {
+    setConf( conf );
+  }
+
+  /**
+   * <p>Configures the job.  Sets the url filters, scoring filters, url normalizers
+   * and other relevant data.</p>
+   *
+   * @param job The job configuration.
+   */
+  public void configure( JobConf job )
+  {
+    // set the url filters, scoring filters, the parse util and the url
+    // normalizers
+    this.jobConf     = job;
+    this.urlFilters  = new URLFilters    ( jobConf );
+    this.scfilters   = new ScoringFilters( jobConf );
+    this.parseUtil   = new ParseUtil     ( jobConf );
+    this.normalizers = new URLNormalizers( jobConf, URLNormalizers.SCOPE_FETCHER );
+    this.interval    = jobConf.getInt( "db.fetch.interval.default", 2592000 );
+  }
+
+  /**
+   * In Mapper interface.
+   * {@inheritDoc}
+   */
+  public void close()
+  {
+
+  }
+
+  /**
+   * <p>Runs the Map job to translate an arc file into output for Nutch
+   * segments.</p>
+   *
+   * @param key Line number in manifest corresponding to the <code>value</code>
+   * @param value A line from the manifest
+   * @param output The output collector.
+   * @param reporter The progress reporter.
+   */
+  public void map( final WritableComparable key,
+                   final Writable value,
+                   final OutputCollector output,
+                   final Reporter reporter )
+    throws IOException
+  {
+    String arcUrl      = "";
+    String collection  = "";
+    String segmentName = getConf().get( Nutch.SEGMENT_NAME_KEY );
+
+    // Each line of the manifest is "<url> <collection>" where <collection> is optional
+    String[] line = value.toString().split( "\\s+" );
+    arcUrl = line[0];
+
+    if ( line.length > 1 )
+    {
+      collection = line[1];
+    }
+
+    if ( LOG.isInfoEnabled() ) LOG.info( "Importing ARC: " + arcUrl );
+
+    ArchiveReader r = ArchiveReaderFactory.get( arcUrl );
+    r.setDigest( true );
+
+    ArcReader reader = new ArcReader( r );
+
+    try
+    {
+      for ( ARCRecord record : reader )
+      {
+        // When reading WARC files, records of type other than
+        // "response" are returned as 'null' by the Iterator, so
+        // we skip them.
+        if ( record == null ) continue;
+
+        importRecord( record, segmentName, collection, output );
+
+        reporter.progress();
+      }
+    }
+    finally
+    {
+      r.close();
+
+      if ( LOG.isInfoEnabled() )
+      {
+        LOG.info( "Completed ARC: " + arcUrl );
+      }
+    }
+
+  }
+
+  /**
+   * Import an ARCRecord.
+   *
+   * @param record
+   * @param segmentName
+   * @param collectionName
+   * @param output
+   * @return whether record was imported or not (i.e. filtered out due to URL filtering rules, etc.)
+   */
+  private boolean importRecord( ARCRecord record, String segmentName, String collectionName, OutputCollector output )
+  {
+    ARCRecordMetaData meta = record.getMetaData();
+
+    if ( LOG.isInfoEnabled() ) LOG.info( "Consider URL: " + meta.getUrl() + " (" + meta.getMimetype() + ")" );
+
+    try
+    {
+      // Skip the HTTP headers in the response, so that the
+      // parsers are parsing the response body and not the HTTP
+      // headers.
+ record.skipHttpHeader(); + + // TODO: Put in a size limiter, akin to Nutch's file.content.limit. + + // Read the bytes of the HTTP response + byte[] bytes = new byte[(int) meta.getLength()]; + + // NOTE: Do not use read(byte[]) because ArchiveRecord does NOT over-ride + // the implementation inherited from InputStream. And since it does + // not over-ride it, it won't do the digesting on it. Must use either + // read(byte[],offset,length) or read(). + record.read( bytes, 0, bytes.length ); + + // Must call close() for digest calculation to be finished. + if ( meta.getDigest() == null ) + { + record.close(); + + // This is a bit hacky, but ARC and WARC records produce + // two slightly different digest formats. WARC record + // digests have the algorithm name as a prefix, such as + // "sha1:PD3SS4WWZVFWTDC63RU2MWX7BVC2Y2VA" but the + // ArcRecord.getDigestStr() does not. Since we want the + // formats to match, we prepend the "sha1:" prefix to ARC + // record digest. + meta.setDigest( "sha1:" + record.getDigestStr() ); + } + + // Normalize and filter + String url = this.normalizeAndFilterUrl( meta.getUrl(), meta.getDigest(), meta.getDate() ); + + if ( url == null ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "Skip URL: " + meta.getUrl() ); + return false; + } + + Metadata contentMetadata = new Metadata(); + // Set the segment name, just as is done by standard Nutch fetching. + // Then, add the NutchWAX-specific metadata fields. + contentMetadata.set( Nutch .SEGMENT_NAME_KEY, segmentName ); + + contentMetadata.set( NutchWax.CONTENT_TYPE_KEY, meta.getMimetype() ); + contentMetadata.set( NutchWax.ARCNAME_KEY, meta.getArcFile().getName() ); + contentMetadata.set( NutchWax.COLLECTION_KEY, collectionName ); + contentMetadata.set( NutchWax.DATE_KEY, meta.getDate() ); + contentMetadata.set( NutchWax.DIGEST_KEY, meta.getDigest() ); + + Content content = new Content( url, url, bytes, meta.getMimetype(), contentMetadata, getConf() ); + + output( output, new Text( url ), content ); + + return true; + } + catch ( Throwable t ) + { + LOG.error( "Import fail : " + meta.getUrl(), t ); + } + + return false; + } + + /** + * Normalize and filter the URL. If the URL is malformed or + * filtered (according to registered Nutch URL filtering plugins), + * return <code>null</code>. Otherwise return the normalized URL. + * + * @param candidateUrl to be normalized and filtered + * @param digest of URL content + * @param date of URL capture + * @return normalized URL, <code>null</code> if malformed or filtered out + */ + private String normalizeAndFilterUrl( String candidateUrl, String digest, String date ) + { + String url = null; + try + { + url = normalizers.normalize( candidateUrl, URLNormalizers.SCOPE_FETCHER ); + + if ( urlFilters.filter( url + " " + digest + " " + date ) != null ) + { + return url; + } + } + catch ( MalformedURLException mue ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "MalformedURL: " + candidateUrl ); + } + catch ( URLFilterException ufe ) + { + if ( LOG.isInfoEnabled() ) LOG.info( "URL filtered: " + candidateUrl ); + } + + return null; + } + + /** + * + */ + private void output( OutputCollector output, + Text key, + Content content ) + { + // Create the datum + CrawlDatum datum = new CrawlDatum( CrawlDatum.STATUS_FETCH_SUCCESS, this.interval, 1.0f ); + + // ?: I have no idea why we need to store the ProtocolStatus in + // the datum's metadata, but the Nutch Fetcher class does it and + // it seems important. 
Since we're not really fetching here, we + // assume ProtocolStatus.STATUS_SUCCESS is the right thing to do. + datum.getMetaData().put( Nutch.WRITABLE_PROTO_STATUS_KEY, ProtocolStatus.STATUS_SUCCESS ); + + // ?: Since we store the ARCRecord's archival date in the Content object, we follow the + // logic in Nutch's Fetcher and store the current import time/date in the Datum. I have + // no idea if it makes a difference, other than this value is stored in the "tstamp" + // field in the Lucene index whereas the ARCRecord date is stored in the "date" field + // we added above. + datum.setFetchTime( System.currentTimeMillis() ); + + // ?: It doesn't seem to me that we need/use the scoring stuff + // one way or another, but we might as well leave it in. + try + { + scfilters.passScoreBeforeParsing( key, datum, content ); + } + catch ( Exception e ) + { + if ( LOG.isWarnEnabled() ) LOG.warn( "Couldn't pass score before parsing for: " + key, e ); + } + + // ?: This is kind of interesting. In the Nutch Fetcher class, if the parsing fails, + // the Content is not added to the output. But in Importer, we still add it, even + // if the parsing fails. Why? + // + // One benefit is that even if the parsing fails, having the Content in the index still + // allows us to find the document by URL, date, etc. + // + // However, I don't know what will happen when a summary is computed...if the Content isn't there, will + // it fail or just return an empty summary? + ParseResult parseResult = null; + try + { + parseResult = this.parseUtil.parse( content ); + } + catch ( Exception e ) + { + LOG.warn( "Error parsing: " + key, e ); + } + + // ?: This is taken from Nutch Fetcher. I believe the signatures are used in the Fetcher + // to ensure that URL contents are not stored multiple times if the signature doesn't change. + // Makes sense. But, in our case, we're relying on the (W)ARC production tools to eliminate + // duplicate data (or are we?), so how important is the signature for our purposes? + // I'll go ahead and leave it in, in case it's needed by Nutch for unknown purposes. + // + // Also, since we still import documents even if the parsing fails, we compute a signature + // using an "empty" Parse object in the case of parse failure. I don't know why we create + // an empty Parse object rather than just use 'null', but I'm copying the way the Fetcher + // does it. + // + // One odd thing is that we add the signature to the datum here, then "collect" the datum + // just below, but then after collecting the datum, we update the signature when processing + // the ParseResults. I guess "collecting" doesn't write out the datum, but "collects" it for + // later output, thus we can update it after collection (I guess). 
+ if ( parseResult == null ) + { + byte[] signature = SignatureFactory.getSignature( getConf() ).calculate( content, new ParseStatus().getEmptyParse( getConf() ) ); + datum.setSignature( signature ); + } + + try + { + output.collect( key, new NutchWritable( datum ) ); + output.collect( key, new NutchWritable( content ) ); + + if ( parseResult != null ) + { + for ( Entry<Text, Parse> entry : parseResult ) + { + Text url = entry.getKey(); + Parse parse = entry.getValue(); + ParseStatus parseStatus = parse.getData().getStatus(); + + if ( !parseStatus.isSuccess() ) + { + LOG.warn( "Error parsing: " + key + ": " + parseStatus ); + parse = parseStatus.getEmptyParse(getConf()); + } + + byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse); + + // ?: Why bother setting this one again? According to ParseData Javadoc, + // the getContentMeta() returns the original Content metadata object, so + // why are we setting the segment name on it to the same value again? + // Let's leave it out. + // parse.getData().getContentMeta().set( Nutch.SEGMENT_NAME_KEY, segmentName ); + + // ?: These two are copied from Nutch's Fetcher implementation. + parse.getData().getContentMeta().set( Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature) ); + parse.getData().getContentMeta().set( Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime() ) ); + + if ( url.equals( key ) ) + { + datum.setSignature( signature ); + } + + // ?: As above, we'll leave the scoring hooks in place. + try + { + scfilters.passScoreAfterParsing( url, content, parse ); + } + catch ( Exception e ) + { + if ( LOG.isWarnEnabled() ) LOG.warn( "Couldn't pass score, url = " + key, e ); + } + + output.collect( url, new NutchWritable( new ParseImpl( new ParseText( parse.getText() ), parse.getData(), parse.isCanonical() ) ) ); + } + } + } + catch ( Exception e ) + { + LOG.error( "Error outputting Nutch record for: " + key, e ); + } + } + + /** + * + */ + public int run( String[] args ) throws Exception + { + if ( args.length < 1 ) + { + usage( ); + return -1; + } + + JobConf job = new NutchJob( getConf() ); + + // Check for "-e <exclusions>" option. 
+    int pos = 0;
+    if ( args[0].equals( "-e" ) )
+    {
+      if ( args.length < 2 )
+      {
+        System.err.println( "ERROR: Missing filename for option \"-e\"\n" );
+        usage( );
+        return -1;
+      }
+
+      job.set( "nutchwax.urlfilter.wayback.exclusions", args[1] );
+
+      pos = 2;
+    }
+
+    if ( args.length - pos < 1 )
+    {
+      System.err.println( "ERROR: Missing manifest file.\n" );
+      usage( );
+      return -1;
+    }
+
+    Path manifestPath = new Path( args[pos++] );
+
+    Path segmentPath;
+    if ( args.length - pos < 1 )
+    {
+      segmentPath = new Path( "segments", org.apache.nutch.crawl.Generator.generateSegmentName( ) );
+    }
+    else
+    {
+      segmentPath = new Path( args[pos] );
+    }
+
+    try
+    {
+      job.setJobName( "Importer " + manifestPath );
+      job.set( Nutch.SEGMENT_NAME_KEY, segmentPath.getName() );
+
+      job.setInputPath  ( manifestPath );
+      job.setInputFormat( TextInputFormat.class );
+
+      job.setMapperClass( Importer.class );
+
+      job.setOutputPath      ( segmentPath );
+      job.setOutputFormat    ( FetcherOutputFormat.class );
+      job.setOutputKeyClass  ( Text.class );
+      job.setOutputValueClass( NutchWritable.class );
+
+      JobClient.runJob( job );
+    }
+    catch ( Exception e )
+    {
+      LOG.fatal( "Importer: ", e );
+      return -1;
+    }
+
+    return 0;
+  }
+
+  /**
+   *
+   */
+  public void usage( )
+  {
+    String usage =
+      "Usage: Importer [opts] <manifest> [<segment>]\n"
+      + "Options:\n"
+      + "  -e filename  Exclusions file, overrides the configuration property.\n"
+      + "\n"
+      + "If <segment> is not specified, a pathname will be automatically generated\n"
+      + "based on current time in sub-directory 'segments', which is created if\n"
+      + "necessary.  This is to mirror the behavior of other Nutch actions.\n"
+      ;
+
+    System.err.println( usage );
+  }
+
+  /**
+   *
+   */
+  public static void main( String args[] ) throws Exception
+  {
+    int result = ToolRunner.run( NutchConfiguration.create(), new Importer(), args );
+
+    System.exit( result );
+  }
+
+}
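A detail of importRecord() above worth calling out for anyone reusing this code: WARC record digests arrive with the algorithm name as a prefix (e.g. "sha1:PD3SS4WWZVFWTDC63RU2MWX7BVC2Y2VA"), while ArcRecord.getDigestStr() returns the bare Base32 value, so the importer prepends "sha1:" to ARC digests before storing them. A minimal standalone sketch of that alignment follows; the class and method names are illustrative, not part of the commit.

  public class DigestAlign
  {
    /** Return the digest in the prefixed form WARC records already use. */
    public static String align( String digest )
    {
      if ( digest == null || digest.startsWith( "sha1:" ) )
      {
        return digest;
      }
      // Bare Base32 value, e.g. from ArcRecord.getDigestStr(): add the
      // algorithm prefix so ARC and WARC digests compare equal.
      return "sha1:" + digest;
    }

    public static void main( String[] args )
    {
      // Both calls print "sha1:PD3SS4WWZVFWTDC63RU2MWX7BVC2Y2VA".
      System.out.println( align( "PD3SS4WWZVFWTDC63RU2MWX7BVC2Y2VA" ) );
      System.out.println( align( "sha1:PD3SS4WWZVFWTDC63RU2MWX7BVC2Y2VA" ) );
    }
  }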
From: <bi...@us...> - 2008-06-26 22:36:36
Revision: 2333
http://archive-access.svn.sourceforge.net/archive-access/?rev=2333&view=rev
Author: binzino
Date: 2008-06-26 15:36:45 -0700 (Thu, 26 Jun 2008)

Log Message:
-----------
Initial revision.

Added Paths:
-----------
trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java

Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java (rev 0)
+++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DateAdder.java 2008-06-26 22:36:45 UTC (rev 2333)
@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of the archive-access tools project
+ * (http://sourceforge.net/projects/archive-access).
+ *
+ * The archive-access tools are free software; you can redistribute them and/or
+ * modify them under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or any
+ * later version.
+ *
+ * The archive-access tools are distributed in the hope that they will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
+ * Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License along with
+ * the archive-access tools; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+package org.archive.nutchwax.tools;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.Collections;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+
+import org.archive.wayback.UrlCanonicalizer;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+
+/**
+ * Reads a series of (digest+URL,date) lines, finds the corresponding
+ * document in the index, and adds the date to it.
+ */
+public class DateAdder
+{
+
+
+  public static void main(String[] args)
+    throws Exception
+  {
+    if ( args.length < 4 )
+    {
+      System.out.println( "DateAdder <key-index> <source1> ... <sourceN> <dest> <records>" );
+      System.exit( 0 );
+    }
+
+    String mainIndexDir = args[0].trim();
+    String destIndexDir = args[args.length - 2].trim();
+    String recordsFile  = args[args.length - 1].trim();
+
+    InputStream recordsStream;
+    if ( "-".equals( recordsFile ) )
+    {
+      recordsStream = System.in;
+    }
+    else
+    {
+      recordsStream = new FileInputStream( recordsFile );
+    }
+
+    // Read date-addition records from the records file (or stdin if "-").
+    Map<String,String> dateRecords = new HashMap<String,String>( );
+    BufferedReader br = new BufferedReader( new InputStreamReader( recordsStream, "UTF-8" ) );
+    String line;
+    while ( (line = br.readLine( )) != null )
+    {
+      String parts[] = line.split( "\\s+" );
+      if ( parts.length != 3 )
+      {
+        System.out.println( "Malformed line: " + line );
+        continue;
+      }
+
+      // Key is hash+url, value is a String which is a " "-separated list of dates
+      String key = parts[0] + parts[1];
+      String dates = dateRecords.get( key );
+      if ( dates != null )
+      {
+        dates += " " + parts[2];
+        dateRecords.put( key, dates );
+      }
+      else
+      {
+        dateRecords.put( key, parts[2] );
+      }
+
+    }
+
+    IndexReader reader = IndexReader.open( mainIndexDir );
+
+    IndexReader sourceReaders[] = new IndexReader[args.length-3];
+    for ( int i = 0 ; i < sourceReaders.length ; i++ )
+    {
+      sourceReaders[i] = IndexReader.open( args[i+1] );
+    }
+
+    IndexWriter writer = new IndexWriter( destIndexDir, new WhitespaceAnalyzer( ), true );
+
+    UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer( );
+
+    for ( int i = 0 ; i < reader.numDocs( ) ; i++ )
+    {
+      Document oldDoc = reader.document( i );
+      Document newDoc = new Document( );
+
+      // Copy the source values to the new document.
+      /*
+      String dates[] = oldDoc.getValues( "date" );
+
+      if ( dates != null )
+      {
+        for ( String date : dates )
+        {
+          newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
+        }
+      }
+      */
+      Set<String> uniqueDates = new HashSet<String>( );
+      for ( IndexReader source : sourceReaders )
+      {
+        Document sourceDoc = source.document( i );
+
+        String dates[] = sourceDoc.getValues( "date" );
+
+        java.util.Collections.addAll( uniqueDates, dates );
+      }
+      for ( String date : uniqueDates )
+      {
+        newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
+      }
+
+      // First, apply URL canonicalization from Wayback.
+      String canonicalizedUrl = canonicalizer.urlStringToKey( oldDoc.get( "url" ) );
+
+      // Now, build the digest+URL key for this document (digest first,
+      // the same order as the keys in the dateRecords map), look it up
+      // in that map and, if found, add the dates.
+      String key = oldDoc.get( "archive-digest" ) + canonicalizedUrl;
+
+      String newDates = dateRecords.get( key );
+      if ( newDates != null )
+      {
+        for ( String date : newDates.split( "\\s+" ) )
+        {
+          newDoc.add( new Field( "date", date, Field.Store.YES, Field.Index.UN_TOKENIZED ) );
+        }
+      }
+
+      writer.addDocument( newDoc );
+    }
+
+    reader.close( );
+    writer.close( );
+  }
+
+}
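DateAdder's matching stands or falls on that composite key: the records map is keyed digest-first, so the per-document lookup must assemble its key in the same order from the "archive-digest" field and the canonicalized URL, or no dates will ever match. A self-contained sketch of the map-building half, assuming the "<digest> <url> <date>" line format the parser expects (the class name is illustrative, not part of the commit):

  import java.util.HashMap;
  import java.util.Map;

  public class DateRecords
  {
    /**
     * Build the (digest+URL) -> "date date ..." map from record lines.
     * Dates for the same capture accumulate into one space-separated value.
     */
    public static Map<String,String> build( Iterable<String> lines )
    {
      Map<String,String> records = new HashMap<String,String>( );
      for ( String line : lines )
      {
        String[] parts = line.split( "\\s+" );
        if ( parts.length != 3 ) continue;   // skip malformed lines

        String key   = parts[0] + parts[1];  // digest first, then URL
        String dates = records.get( key );
        records.put( key, dates == null ? parts[2] : dates + " " + parts[2] );
      }
      return records;
    }
  }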
From: <bi...@us...> - 2008-06-26 22:36:11
Revision: 2332 http://archive-access.svn.sourceforge.net/archive-access/?rev=2332&view=rev Author: binzino Date: 2008-06-26 15:36:20 -0700 (Thu, 26 Jun 2008) Log Message: ----------- Initial revision. Added Paths: ----------- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java Added: trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java =================================================================== --- trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java (rev 0) +++ trunk/archive-access/projects/nutchwax/archive/src/java/org/archive/nutchwax/tools/DumpParallelIndex.java 2008-06-26 22:36:20 UTC (rev 2332) @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2008 Internet Archive. + * + * This file is part of the archive-access tools project + * (http://sourceforge.net/projects/archive-access). + * + * The archive-access tools are free software; you can redistribute them and/or + * modify them under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or any + * later version. + * + * The archive-access tools are distributed in the hope that they will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser + * Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License along with + * the archive-access tools; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.nutchwax.tools; + +import java.io.File; +import java.util.Iterator; +import java.util.Arrays; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.ArchiveParallelReader; + +public class DumpParallelIndex +{ + public static void main( String[] args ) throws Exception + { + String option = ""; + String indexDir = ""; + + if ( args.length < 1 ) + { + usageAndExit( ); + } + + int offset = 0; + if ( args[0].equals( "-f" ) ) + { + offset = 1; + } + + String dirs[] = new String[args.length - offset]; + System.arraycopy( args, offset, dirs, 0, args.length - offset ); + + ArchiveParallelReader reader = new ArchiveParallelReader( ); + for ( String dir : dirs ) + { + reader.add( IndexReader.open( dir ) ); + } + + if ( offset > 0 ) + { + listFields( reader ); + } + else + { + dumpIndex( reader ); + } + } + + private static void dumpIndex( IndexReader reader ) throws Exception + { + Object[] fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL).toArray(); + + for (int i = 0; i < fieldNames.length; i++) + { + System.out.print(fieldNames[i] + "\t"); + } + + System.out.println(); + + int numDocs = reader.numDocs(); + + for (int i = 0; i < numDocs; i++) + { + for (int j = 0; j < fieldNames.length; j++) + { + System.out.print( Arrays.toString( reader.document(i).getValues((String) fieldNames[j])) + "\t" ); + } + + System.out.println(); + } + } + + private static void listFields( IndexReader reader ) throws Exception + { + Iterator it = reader.getFieldNames(IndexReader.FieldOption.ALL).iterator(); + + while (it.hasNext()) + { + System.out.println(it.next()); + } + + reader.close(); + } + + private static void usageAndExit() + { + System.out.println("Usage: DumpParallelIndex [-f] index1 ... 
indexN"); + System.exit(1); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bi...@us...> - 2008-06-26 22:35:23
Revision: 2331
http://archive-access.svn.sourceforge.net/archive-access/?rev=2331&view=rev
Author: binzino
Date: 2008-06-26 15:35:33 -0700 (Thu, 26 Jun 2008)

Log Message:
-----------
Add WaybackURLFilter configuration. Add archive-digest field to indexing and query plugins configurations.

Modified Paths:
--------------
trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml

Modified: trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml
===================================================================
--- trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-26 22:34:24 UTC (rev 2330)
+++ trunk/archive-access/projects/nutchwax/archive/conf/nutch-site.xml 2008-06-26 22:35:33 UTC (rev 2331)
@@ -10,7 +10,7 @@
   <!-- Add 'index-nutchwax' and 'query-nutchwax' to plugin list. -->
   <!-- Also, add 'parse-pdf' -->
   <!-- Remove 'urlfilter-regex' and 'normalizer-(pass|regex|basic)' -->
-  <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic</value>
+  <value>protocol-http|parse-(text|html|js|pdf)|index-(basic|anchor|nutchwax)|query-(basic|site|url|nutchwax)|summary-basic|scoring-opic|urlfilter-nutchwax</value>
 </property>

@@ -24,6 +24,7 @@
   -->
   <name>nutchwax.filter.index</name>
   <value>
+    archive-digest:false
     arcname:false
     collection
     date

@@ -45,6 +46,7 @@
   <!-- We do *not* use this filter for handling "date" queries, there is a specific filter for that: DateQueryFilter -->
   <name>nutchwax.filter.query</name>
   <value>
+    raw:archive-digest:false
     raw:arcname:false
     group:collection
     group:type

@@ -62,8 +64,19 @@
 <property>
   <name>mime.type.magic</name>
   <value>false</value>
-  <description>Defines if the mime content type detector uses magic resolution.
-  </description>
+  <description>Defines if the mime content type detector uses magic resolution.</description>
 </property>

+<property>
+  <name>nutchwax.urlfilter.wayback.exclusions</name>
+  <value></value>
+  <description>Path to file containing list of exclusions.</description>
+</property>
+
+<property>
+  <name>nutchwax.urlfilter.wayback.canonicalizer</name>
+  <value>org.archive.wayback.util.url.AggressiveUrlCanonicalizer</value>
+  <description></description>
+</property>
+
 </configuration>
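One deployment note on the new properties: nutchwax.urlfilter.wayback.exclusions ships empty here, and the Importer's -e option (revision 2334 above) sets the same property per job. A deployment can instead pin it in its own nutch-site.xml; a hypothetical override, where the path is a placeholder rather than a value from the commit:

  <property>
    <name>nutchwax.urlfilter.wayback.exclusions</name>
    <value>/opt/nutchwax/conf/exclusions.txt</value>
    <description>Path to file containing list of exclusions.</description>
  </property>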