|
From: <bra...@us...> - 2008-11-07 22:38:52
|
Revision: 2639
http://archive-access.svn.sourceforge.net/archive-access/?rev=2639&view=rev
Author: bradtofel
Date: 2008-11-07 22:38:48 +0000 (Fri, 07 Nov 2008)
Log Message:
-----------
FEATURE: added -all option to the warc-indexer command line tool, causing the tool to output request and metadata records in addition to duplicate, capture, and dns records.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-11-07 22:35:24 UTC (rev 2638)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-11-07 22:38:48 UTC (rev 2639)
@@ -2,7 +2,7 @@
import java.io.File;
import java.io.IOException;
-//import java.util.logging.Logger;
+import java.util.logging.Logger;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
@@ -33,14 +33,23 @@
*/
public class WARCRecordToSearchResultAdapter
implements Adapter<WARCRecord,CaptureSearchResult>{
+ private static final Logger LOGGER =
+ Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName());
private final static String DEFAULT_VALUE = "-";
-// private static final Logger LOGGER = Logger.getLogger(
-// WARCRecordToSearchResultAdapter.class.getName());
-
private UrlCanonicalizer canonicalizer = null;
+
+ private boolean processAll = false;
+ public boolean isProcessAll() {
+ return processAll;
+ }
+
+ public void setProcessAll(boolean processAll) {
+ this.processAll = processAll;
+ }
+
public WARCRecordToSearchResultAdapter() {
canonicalizer = new AggressiveUrlCanonicalizer();
}
@@ -75,12 +84,19 @@
return output.toString();
}
- private static String transformHTTPMime(final String input) {
+ private static String escapeSpaces(final String input) {
+ if(input.contains(" ")) {
+ return input.replace(" ", "%20");
+ }
+ return input;
+ }
+
+ private static String transformHTTPMime(String input) {
int semiIdx = input.indexOf(";");
if(semiIdx > 0) {
- return input.substring(0,semiIdx).trim();
+ return escapeSpaces(input.substring(0,semiIdx).trim());
}
- return input.trim();
+ return escapeSpaces(input.trim());
}
private String transformWarcFilename(String readerIdentifier) {
@@ -148,16 +164,21 @@
return result;
}
- private CaptureSearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec)
+ private CaptureSearchResult adaptGeneric(ArchiveRecordHeader header,
+ WARCRecord rec, String mime)
throws IOException {
CaptureSearchResult result = getBlankSearchResult();
result.setCaptureTimestamp(transformDate(header.getDate()));
+ result.setFile(transformWarcFilename(header.getReaderIdentifier()));
+ result.setOffset(header.getOffset());
result.setDigest(transformDigest(header.getHeaderValue(
- WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
+ WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
addUrlDataToSearchResult(result,header.getUrl());
+
+ result.setMimeType(mime);
return result;
}
@@ -243,7 +264,7 @@
}
return result;
}
-
+
private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {
CaptureSearchResult result = null;
@@ -257,7 +278,17 @@
result = adaptResponse(header,rec);
}
} else if(type.equals(WARCConstants.REVISIT)) {
- result = adaptRevisit(header,rec);
+ result = adaptGeneric(header,rec,"warc/revisit");
+ } else if(type.equals(WARCConstants.REQUEST)) {
+ if(processAll) {
+ result = adaptGeneric(header,rec,"warc/request");
+ }
+ } else if(type.equals(WARCConstants.METADATA)) {
+ if(processAll) {
+ result = adaptGeneric(header,rec,"warc/metadata");
+ }
+ } else {
+ LOGGER.info("Skipping record type : " + type);
}
return result;
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-11-07 22:35:24 UTC (rev 2638)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-11-07 22:38:48 UTC (rev 2639)
@@ -26,9 +26,19 @@
public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
private UrlCanonicalizer canonicalizer = null;
+ private boolean processAll = false;
public WarcIndexer() {
canonicalizer = new AggressiveUrlCanonicalizer();
}
+
+ public boolean isProcessAll() {
+ return processAll;
+ }
+
+ public void setProcessAll(boolean processAll) {
+ this.processAll = processAll;
+ }
+
/**
* @param warc
@@ -61,6 +71,7 @@
WARCRecordToSearchResultAdapter adapter2 =
new WARCRecordToSearchResultAdapter();
adapter2.setCanonicalizer(canonicalizer);
+ adapter2.setProcessAll(processAll);
ArchiveReaderCloseableIterator itr1 =
new ArchiveReaderCloseableIterator(reader,reader.iterator());
@@ -82,11 +93,12 @@
private static void USAGE() {
System.err.println("USAGE:");
System.err.println("");
- System.err.println("warc-indexer [-identity] WARCFILE");
- System.err.println("warc-indexer [-identity] WARCFILE CDXFILE");
+ System.err.println("warc-indexer [-identity] [-all] WARCFILE");
+ System.err.println("warc-indexer [-identity] [-all] WARCFILE CDXFILE");
System.err.println("");
System.err.println("Create a CDX format index at CDXFILE or to STDOUT");
System.err.println("With -identity, perform no url canonicalization.");
+ System.err.println("With -all, output request and metadata records.");
System.exit(1);
}
@@ -96,8 +108,14 @@
public static void main(String[] args) {
WarcIndexer indexer = new WarcIndexer();
int idx = 0;
- if(args[0] != null && args[0].equals("-identity")) {
- indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
+ while(args[idx] != null) {
+ if(args[idx].equals("-identity")) {
+ indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
+ } else if(args[idx].equals("-all")) {
+ indexer.setProcessAll(true);
+ } else {
+ break;
+ }
idx++;
}
File arc = new File(args[idx]);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2009-11-06 01:49:43
|
Revision: 2885
http://archive-access.svn.sourceforge.net/archive-access/?rev=2885&view=rev
Author: bradtofel
Date: 2009-11-06 01:49:32 +0000 (Fri, 06 Nov 2009)
Log Message:
-----------
REFACTOR: moved main() from ArcIndexer and WarcIndexer into IndexWorker
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-11-06 01:42:28 UTC (rev 2884)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-11-06 01:49:32 UTC (rev 2885)
@@ -25,9 +25,7 @@
package org.archive.wayback.resourcestore.indexer;
import java.io.File;
-import java.io.PrintWriter;
import java.io.IOException;
-import java.util.Iterator;
import org.archive.io.ArchiveRecord;
import org.archive.io.arc.ARCReader;
@@ -35,12 +33,10 @@
import org.archive.io.arc.ARCRecord;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
-import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
import org.archive.wayback.util.AdaptedIterator;
import org.archive.wayback.util.Adapter;
import org.archive.wayback.util.CloseableIterator;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
-import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
/**
* Transforms an ARC file into Iterator<CaptureSearchResult>.
@@ -50,10 +46,6 @@
*/
public class ArcIndexer {
- /**
- * CDX Header line for these fields. not very configurable..
- */
- public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
private UrlCanonicalizer canonicalizer = null;
public ArcIndexer() {
@@ -113,51 +105,6 @@
this.canonicalizer = canonicalizer;
}
- private static void USAGE() {
- System.err.println("USAGE:");
- System.err.println("");
- System.err.println("arc-indexer [-identity] ARCFILE");
- System.err.println("arc-indexer [-identity] ARCFILE CDXFILE");
- System.err.println("");
- System.err.println("Create a CDX format index at CDXFILE or to STDOUT.");
- System.err.println("With -identity, perform no url canonicalization.");
- System.exit(1);
- }
-
- /**
- * @param args
- */
- public static void main(String[] args) {
- ArcIndexer indexer = new ArcIndexer();
- int idx = 0;
- if(args[0] != null && args[0].equals("-identity")) {
- indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
- idx++;
- }
- File arc = new File(args[idx]);
- idx++;
- PrintWriter pw = null;
- try {
- if(args.length == idx) {
- // dump to STDOUT:
- pw = new PrintWriter(System.out);
- } else if(args.length == (idx + 1)) {
- pw = new PrintWriter(args[idx]);
- } else {
- USAGE();
- }
- Iterator<CaptureSearchResult> res = indexer.iterator(arc);
- Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res);
- while(lines.hasNext()) {
- pw.println(lines.next());
- }
- pw.close();
- } catch (Exception e) {
- e.printStackTrace();
- System.exit(1);
- }
- }
-
private class ArchiveRecordToARCRecordAdapter
implements Adapter<ArchiveRecord,ARCRecord> {
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2009-11-06 01:42:28 UTC (rev 2884)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2009-11-06 01:49:32 UTC (rev 2885)
@@ -24,16 +24,23 @@
*/
package org.archive.wayback.resourcestore.indexer;
+import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Iterator;
import java.util.logging.Logger;
import org.archive.wayback.Shutdownable;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.resourceindex.cdx.CDXFormatIndex;
+import org.archive.wayback.resourceindex.cdx.SearchResultToCDXFormatAdapter;
+import org.archive.wayback.resourceindex.cdx.format.CDXFormat;
+import org.archive.wayback.resourceindex.cdx.format.CDXFormatException;
import org.archive.wayback.resourceindex.updater.IndexClient;
import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB;
import org.archive.wayback.util.CloseableIterator;
-//import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
/**
@@ -112,6 +119,7 @@
}
} catch(IOException e) {
LOGGER.severe("FAILED to index or upload (" + name + ")");
+ e.printStackTrace();
}
}
return worked;
@@ -133,7 +141,86 @@
}
return itr;
}
+
+ private static void USAGE() {
+ System.err.println("USAGE:");
+ System.err.println("");
+ System.err.println("cdx-indexer [-format FORMAT|-identity] FILE");
+ System.err.println("cdx-indexer [-format FORMAT|-identity] FILE CDXFILE");
+ System.err.println("");
+ System.err.println("Create a CDX format index from ARC or WARC file");
+ System.err.println("FILE at CDXFILE or to STDOUT.");
+ System.err.println("With -identity, perform no url canonicalization.");
+ System.err.println("With -format, output CDX in format FORMAT.");
+ System.exit(1);
+ }
+ /**
+ * @param args
+ */
+ public static void main(String[] args) {
+ String cdxSpec = CDXFormatIndex.CDX_HEADER_MAGIC;
+ PrintWriter pw = new PrintWriter(System.out);
+ UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();
+ boolean setFormat = false;
+ boolean isIdentity = false;
+ String path = null;
+ for(int idx = 0; idx < args.length; idx++) {
+ if(args[idx].equals("-identity")) {
+ canonicalizer = new IdentityUrlCanonicalizer();
+ isIdentity = true;
+ } else if(args[idx].equals("-format")) {
+ idx++;
+ if(idx >= args.length) {
+ USAGE();
+ }
+ cdxSpec = args[idx];
+ setFormat = true;
+ } else {
+ // either input filename:
+ if(path == null) {
+ path = args[idx];
+ } else {
+ // or if that's already been specified, then target file:
+ if(idx+1 != args.length){
+ USAGE();
+ }
+ try {
+ pw = new PrintWriter(args[idx]);
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+ break;
+ }
+ }
+ }
+ if(!setFormat && isIdentity) {
+ cdxSpec = cdxSpec.replace(" N ", " a ");
+ }
+ IndexWorker worker = new IndexWorker();
+ worker.canonicalizer = canonicalizer;
+ worker.interval = 0;
+ worker.init();
+ try {
+ CloseableIterator<CaptureSearchResult> itr = worker.indexFile(path);
+ CDXFormat cdxFormat = new CDXFormat(cdxSpec);
+ Iterator<String> lines =
+ SearchResultToCDXFormatAdapter.adapt(itr, cdxFormat);
+ pw.println(cdxSpec);
+ while(lines.hasNext()) {
+ pw.println(lines.next());
+ }
+ pw.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ System.exit(1);
+ } catch (CDXFormatException e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+
+ }
private class WorkerThread extends Thread {
private long runInterval = 120000;
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-11-06 01:42:28 UTC (rev 2884)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-11-06 01:49:32 UTC (rev 2885)
@@ -2,8 +2,6 @@
import java.io.File;
import java.io.IOException;
-import java.io.PrintWriter;
-import java.util.Iterator;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.WARCReader;
@@ -11,20 +9,13 @@
import org.archive.io.warc.WARCRecord;
import org.archive.wayback.UrlCanonicalizer;
import org.archive.wayback.core.CaptureSearchResult;
-import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
import org.archive.wayback.util.AdaptedIterator;
import org.archive.wayback.util.Adapter;
import org.archive.wayback.util.CloseableIterator;
import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
-import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
public class WarcIndexer {
- /**
- * CDX Header line for these fields. not very configurable..
- */
- public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
-
private UrlCanonicalizer canonicalizer = null;
private boolean processAll = false;
public WarcIndexer() {
@@ -89,60 +80,7 @@
public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
this.canonicalizer = canonicalizer;
}
-
- private static void USAGE() {
- System.err.println("USAGE:");
- System.err.println("");
- System.err.println("warc-indexer [-identity] [-all] WARCFILE");
- System.err.println("warc-indexer [-identity] [-all] WARCFILE CDXFILE");
- System.err.println("");
- System.err.println("Create a CDX format index at CDXFILE or to STDOUT");
- System.err.println("With -identity, perform no url canonicalization.");
- System.err.println("With -all, output request and metadata records.");
- System.exit(1);
- }
- /**
- * @param args
- */
- public static void main(String[] args) {
- WarcIndexer indexer = new WarcIndexer();
- int idx = 0;
- while(args[idx] != null) {
- if(args[idx].equals("-identity")) {
- indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
- } else if(args[idx].equals("-all")) {
- indexer.setProcessAll(true);
- } else {
- break;
- }
- idx++;
- }
- File arc = new File(args[idx]);
- idx++;
- PrintWriter pw = null;
- try {
- if (args.length == idx) {
- // dump to STDOUT:
- pw = new PrintWriter(System.out);
- } else if (args.length == (idx+1)) {
- pw = new PrintWriter(args[1]);
- } else {
- USAGE();
- }
- Iterator<CaptureSearchResult> res = indexer.iterator(arc);
- Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res);
- while (lines.hasNext()) {
- pw.println(lines.next());
- }
- pw.close();
-
- } catch (Exception e) {
- e.printStackTrace();
- System.exit(1);
- }
- }
-
private class ArchiveRecordToWARCRecordAdapter implements
Adapter<ArchiveRecord, WARCRecord> {
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2009-11-06 01:53:33
|
Revision: 2887
http://archive-access.svn.sourceforge.net/archive-access/?rev=2887&view=rev
Author: bradtofel
Date: 2009-11-06 01:53:23 +0000 (Fri, 06 Nov 2009)
Log Message:
-----------
REFACTOR: Moved common HTTP header parsing code into HTTPRecordAnnotater
FEATURE: HTML content is now parsed using the SAX parser, to search for META robots tags
FEATURE: HTTP headers are now inspected for robot-related instructions
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java
Added Paths:
-----------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2009-11-06 01:50:20 UTC (rev 2886)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2009-11-06 01:53:23 UTC (rev 2887)
@@ -36,7 +36,6 @@
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.util.Adapter;
import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
-import org.archive.wayback.util.url.UrlOperations;
/**
*
@@ -50,13 +49,14 @@
// private static final Logger LOGGER = Logger.getLogger(
// ARCRecordToSearchResultAdapter.class.getName());
+ private HTTPRecordAnnotater annotater = null;
private UrlCanonicalizer canonicalizer = null;
public ARCRecordToSearchResultAdapter() {
canonicalizer = new IdentityUrlCanonicalizer();
+ annotater = new HTTPRecordAnnotater();
}
-// public static SearchResult arcRecordToSearchResult(final ARCRecord rec)
-// throws IOException, ParseException {
+
/* (non-Javadoc)
* @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
*/
@@ -68,7 +68,7 @@
return null;
}
}
-
+
private CaptureSearchResult adaptInner(ARCRecord rec) throws IOException {
rec.close();
ARCRecordMetaData meta = rec.getMetaData();
@@ -84,12 +84,14 @@
// initialize with default HTTP code...
result.setHttpCode("-");
+ result.setRedirectUrl("-");
result.setDigest(rec.getDigestStr());
- result.setMimeType(meta.getMimetype());
result.setCaptureTimestamp(meta.getDate());
-
String uriStr = meta.getUrl();
+ result.setOriginalUrl(uriStr);
+
+
if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
// skip filedesc record altogether...
return null;
@@ -97,49 +99,20 @@
if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
// skip URL + HTTP header processing for dns records...
- result.setOriginalUrl(uriStr);
- result.setRedirectUrl("-");
result.setUrlKey(uriStr);
-
+ result.setMimeType("text/dns");
+ result.setEndOffset(rec.compressedBytes);
+
} else {
- result.setOriginalUrl(uriStr);
+ result.setUrlKey(canonicalizer.urlStringToKey(uriStr));
-
String statusCode = (meta.getStatusCode() == null) ? "-" : meta
.getStatusCode();
result.setHttpCode(statusCode);
- String redirectUrl = "-";
Header[] headers = rec.getHttpHeaders();
- if (headers != null) {
-
- for (int i = 0; i < headers.length; i++) {
- if (headers[i].getName().equals(
- WaybackConstants.LOCATION_HTTP_HEADER)) {
-
- String locationStr = headers[i].getValue();
- // TODO: "Location" is supposed to be absolute:
- // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
- // (section 14.30) but Content-Location can be
- // relative.
- // is it correct to resolve a relative Location, as
- // we are?
- // it's also possible to have both in the HTTP
- // headers...
- // should we prefer one over the other?
- // right now, we're ignoring "Content-Location"
- redirectUrl = UrlOperations.resolveUrl(uriStr,
- locationStr);
-
- break;
- }
- }
- result.setRedirectUrl(redirectUrl);
-
- String urlKey = canonicalizer.urlStringToKey(meta.getUrl());
- result.setUrlKey(urlKey);
- }
+ annotater.annotateHTTPContent(result, rec, headers, meta.getMimetype());
}
return result;
}
@@ -149,4 +122,18 @@
public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
this.canonicalizer = canonicalizer;
}
+
+ /**
+ * @return the annotater
+ */
+ public HTTPRecordAnnotater getAnnotater() {
+ return annotater;
+ }
+
+ /**
+ * @param annotater the annotater to set
+ */
+ public void setAnnotater(HTTPRecordAnnotater annotater) {
+ this.annotater = annotater;
+ }
}
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java 2009-11-06 01:53:23 UTC (rev 2887)
@@ -0,0 +1,144 @@
+package org.archive.wayback.resourcestore.indexer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.Header;
+import org.archive.wayback.WaybackConstants;
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.util.htmllex.ContextAwareLexer;
+import org.archive.wayback.util.htmllex.ParseEventDelegator;
+import org.archive.wayback.util.htmllex.ParseContext;
+import org.archive.wayback.util.url.UrlOperations;
+import org.htmlparser.Node;
+import org.htmlparser.lexer.Lexer;
+import org.htmlparser.lexer.Page;
+import org.htmlparser.util.ParserException;
+
+public class HTTPRecordAnnotater {
+ private RobotMetaRule rule = null;
+ private ParseEventDelegator rules = null;
+ private RobotMetaFlags robotFlags;
+ private static final Logger LOGGER =
+ Logger.getLogger(HTTPRecordAnnotater.class.getName());
+
+ private final static String[] mimes = {
+ "html"
+ };
+ public HTTPRecordAnnotater() {
+ rules = new ParseEventDelegator();
+ rules.init();
+ rule = new RobotMetaRule();
+ robotFlags = new RobotMetaFlags();
+ rule.setRobotFlags(robotFlags);
+ rule.visit(rules);
+ }
+ public boolean isHTML(String mimeType) {
+ String mimeLower = mimeType.toLowerCase();
+ for(String mime : mimes) {
+ if(mimeLower.contains(mime)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private String escapeSpaces(final String input) {
+ if(input.contains(" ")) {
+ return input.replace(" ", "%20");
+ }
+ return input;
+ }
+
+ public String transformHTTPMime(String input) {
+ int semiIdx = input.indexOf(";");
+ if(semiIdx > 0) {
+ return escapeSpaces(input.substring(0,semiIdx).trim());
+ }
+ return escapeSpaces(input.trim());
+ }
+
+ public void annotateHTTPContent(CaptureSearchResult result,
+ InputStream is, Header[] headers, String mimeGuess) {
+ robotFlags.reset();
+ String mimeType = null;
+ if (headers != null) {
+
+ for (Header httpHeader : headers) {
+ if (httpHeader.getName().equals(
+ WaybackConstants.LOCATION_HTTP_HEADER)) {
+
+ String locationStr = httpHeader.getValue();
+ // TODO: "Location" is supposed to be absolute:
+ // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
+ // (section 14.30) but Content-Location can be
+ // relative.
+ // is it correct to resolve a relative Location, as
+ // we are?
+ // it's also possible to have both in the HTTP
+ // headers...
+ // should we prefer one over the other?
+ // right now, we're ignoring "Content-Location"
+ result.setRedirectUrl(
+ UrlOperations.resolveUrl(result.getOriginalUrl(),
+ locationStr));
+
+ } else if(httpHeader.getName().toLowerCase().equals("content-type")) {
+ mimeType = transformHTTPMime(httpHeader.getValue());
+ } else if(httpHeader.getName().toLowerCase().equals(
+ WaybackConstants.X_ROBOTS_HTTP_HEADER)) {
+
+ robotFlags.parse(httpHeader.getValue());
+ }
+ }
+ }
+
+ // TODO: get the encoding:
+ String encoding = "utf-8";
+ if(mimeType == null) {
+ // nothing present in the HTTP headers.. Use the WARC field:
+ mimeType = transformHTTPMime(mimeGuess);
+ }
+ result.setMimeType(mimeType);
+ // Now the sticky part: If it looks like an HTML document, look for
+ // robot meta tags:
+ if(isHTML(mimeType)) {
+ String fileContext = result.getFile() + ":" + result.getOffset();
+ annotateHTMLContent(is, encoding, fileContext, result);
+ }
+ robotFlags.apply(result);
+
+ }
+
+ public void annotateHTMLContent(InputStream is, String charSet, String fileContext,
+ CaptureSearchResult result) {
+
+ ParseContext context = new ParseContext();
+
+ Node node;
+ try {
+ ContextAwareLexer lex = new ContextAwareLexer(
+ new Lexer(new Page(is,charSet)),context);
+ while((node = lex.nextNode()) != null) {
+// System.err.println("\nDEBUG-Node:js("+context.isInJS()+")css("+context.isInCSS()+"):");
+// System.err.println("-------------------/START");
+// System.err.println(node.toHtml(true));
+// System.err.println("-------------------/END");
+ rules.handleNode(context, node);
+ }
+ rules.handleParseComplete(context);
+ } catch (ParserException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ LOGGER.warning(fileContext + " " + e.getLocalizedMessage());
+ } catch (UnsupportedEncodingException e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ LOGGER.warning(fileContext + " " + e.getLocalizedMessage());
+ } catch (IOException e) {
+ LOGGER.warning(fileContext + " " + e.getLocalizedMessage());
+ }
+ }
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java 2009-11-06 01:53:23 UTC (rev 2887)
@@ -0,0 +1,44 @@
+package org.archive.wayback.resourcestore.indexer;
+
+import org.archive.wayback.core.CaptureSearchResult;
+
+public class RobotMetaFlags {
+ private static String NO_NOTHIN_MATCH = "NONE";
+ private static String NO_FOLLOW_MATCH = "NOFOLLOW";
+ private static String NO_INDEX_MATCH = "NOINDEX";
+ private static String NO_ARCHIVE_MATCH = "NOARCHIVE";
+
+ private boolean noArchive = false;
+ private boolean noIndex = false;
+ private boolean noFollow = false;
+ public void reset() {
+ noArchive = false;
+ noIndex = false;
+ noFollow = false;
+ }
+ public void parse(String content) {
+ if(content == null) {
+ return;
+ }
+ String up = content.replaceAll("-", "").toUpperCase();
+ if(up.contains(NO_FOLLOW_MATCH)) {
+ noFollow = true;
+ }
+ if(up.contains(NO_ARCHIVE_MATCH)) {
+ noArchive = true;
+ }
+ if(up.contains(NO_INDEX_MATCH)) {
+ noIndex = true;
+ }
+ if(up.contains(NO_NOTHIN_MATCH)) {
+ noFollow = true;
+ noArchive = true;
+ noIndex = true;
+ }
+ }
+ public void apply(CaptureSearchResult result) {
+ if(noFollow) result.setRobotNoFollow();
+ if(noIndex) result.setRobotNoIndex();
+ if(noArchive) result.setRobotNoArchive();
+ }
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java 2009-11-06 01:53:23 UTC (rev 2887)
@@ -0,0 +1,47 @@
+package org.archive.wayback.resourcestore.indexer;
+
+import java.io.IOException;
+
+import org.archive.wayback.util.htmllex.ParseEventDelegator;
+import org.archive.wayback.util.htmllex.ParseEventDelegatorVisitor;
+import org.archive.wayback.util.htmllex.ParseContext;
+import org.archive.wayback.util.htmllex.handlers.OpenTagHandler;
+import org.htmlparser.nodes.TagNode;
+
+public class RobotMetaRule implements ParseEventDelegatorVisitor, OpenTagHandler {
+
+ private RobotMetaFlags robotFlags = null;
+
+ public void visit(ParseEventDelegator rules) {
+ // register for <META> Start tags:
+ rules.addOpenTagHandler(this, "META");
+ }
+
+ public void handleOpenTagNode(ParseContext context, TagNode node)
+ throws IOException {
+ String nameVal = node.getAttribute("name");
+ if(nameVal != null) {
+ if(nameVal.toUpperCase().equals("ROBOTS")) {
+ String content = node.getAttribute("content");
+ if(content != null) {
+ robotFlags.parse(content);
+ }
+ }
+ }
+ }
+
+ /**
+ * @return the robotFlags
+ */
+ public RobotMetaFlags getRobotFlags() {
+ return robotFlags;
+ }
+
+ /**
+ * @param robotFlags the robotFlags to set
+ */
+ public void setRobotFlags(RobotMetaFlags robotFlags) {
+ this.robotFlags = robotFlags;
+ }
+
+}
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java
___________________________________________________________________
Added: svn:keywords
+ Author Date Revision Id
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2009-11-06 01:50:20 UTC (rev 2886)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2009-11-06 01:53:23 UTC (rev 2887)
@@ -2,23 +2,23 @@
import java.io.File;
import java.io.IOException;
-import java.util.logging.Logger;
+//import java.util.logging.Logger;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpParser;
import org.apache.commons.httpclient.StatusLine;
+import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.EncodingUtil;
+import org.apache.log4j.Logger;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.arc.ARCConstants;
import org.archive.io.warc.WARCConstants;
import org.archive.io.warc.WARCRecord;
import org.archive.wayback.UrlCanonicalizer;
-import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.CaptureSearchResult;
import org.archive.wayback.util.Adapter;
-import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
-import org.archive.wayback.util.url.UrlOperations;
+import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
/**
* Adapts certain WARCRecords into SearchResults. DNS and response records are
@@ -33,29 +33,23 @@
*/
public class WARCRecordToSearchResultAdapter
implements Adapter<WARCRecord,CaptureSearchResult>{
+
private static final Logger LOGGER =
Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName());
private final static String DEFAULT_VALUE = "-";
-
private UrlCanonicalizer canonicalizer = null;
+ private HTTPRecordAnnotater annotater = null;
private boolean processAll = false;
- public boolean isProcessAll() {
- return processAll;
- }
-
- public void setProcessAll(boolean processAll) {
- this.processAll = processAll;
- }
-
public WARCRecordToSearchResultAdapter() {
- canonicalizer = new AggressiveUrlCanonicalizer();
+ canonicalizer = new IdentityUrlCanonicalizer();
+ annotater = new HTTPRecordAnnotater();
}
- /* (non-Javadoc)
- * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+ /*
+ * This just calls adaptInner, returning null if an Exception is thrown:
*/
public CaptureSearchResult adapt(WARCRecord rec) {
try {
@@ -65,121 +59,94 @@
return null;
}
}
-
- /*
- * Transform input date to 14-digit timestamp:
- * 2007-08-29T18:00:26Z => 20070829180026
- */
- private static String transformDate(final String input) {
+
+ private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {
- StringBuilder output = new StringBuilder(14);
-
- output.append(input.substring(0,4));
- output.append(input.substring(5,7));
- output.append(input.substring(8,10));
- output.append(input.substring(11,13));
- output.append(input.substring(14,16));
- output.append(input.substring(17,19));
-
- return output.toString();
- }
-
- private static String escapeSpaces(final String input) {
- if(input.contains(" ")) {
- return input.replace(" ", "%20");
- }
- return input;
- }
-
- private static String transformHTTPMime(String input) {
- int semiIdx = input.indexOf(";");
- if(semiIdx > 0) {
- return escapeSpaces(input.substring(0,semiIdx).trim());
- }
- return escapeSpaces(input.trim());
- }
+ ArchiveRecordHeader header = rec.getHeader();
- private String transformWarcFilename(String readerIdentifier) {
- String warcName = readerIdentifier;
- int index = warcName.lastIndexOf(File.separator);
- if (index > 0 && (index + 1) < warcName.length()) {
- warcName = warcName.substring(index + 1);
+ String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
+ if(type.equals(WARCConstants.WARCINFO)) {
+ LOGGER.info("Skipping record type : " + type);
+ return null;
}
- return warcName;
- }
- private String transformDigest(final Object o) {
- if(o == null) {
- return DEFAULT_VALUE;
+ CaptureSearchResult result = genericResult(rec);
+
+ if(type.equals(WARCConstants.RESPONSE)) {
+ String mime = annotater.transformHTTPMime(header.getMimetype());
+ if(mime.equals("text/dns")) {
+ // close to complete reading, then the digest is legit
+ // TODO: DO we want to use the WARC header digest for this?
+ rec.close();
+ result.setDigest(transformWARCDigest(rec.getDigestStr()));
+ result.setMimeType(mime);
+ } else {
+ result = adaptWARCHTTPResponse(result,rec);
+ }
+ } else if(type.equals(WARCConstants.REVISIT)) {
+ // also set the mime type:
+ result.setMimeType("warc/revisit");
+
+ } else if(type.equals(WARCConstants.REQUEST)) {
+
+ if(processAll) {
+ // also set the mime type:
+ result.setMimeType("warc/request");
+ } else {
+ result = null;
+ }
+ } else if(type.equals(WARCConstants.METADATA)) {
+
+ if(processAll) {
+ // also set the mime type:
+ result.setMimeType("warc/metadata");
+ } else {
+ result = null;
+ }
+ } else {
+ LOGGER.info("Skipping record type : " + type);
}
- String orig = o.toString();
- if(orig.startsWith("sha1:")) {
- return orig.substring(5);
- }
- return orig;
+
+ return result;
}
- private CaptureSearchResult getBlankSearchResult() {
+ // ALL HELPER METHODS BELOW:
+
+ /*
+ * Extract all common WARC fields into a CaptureSearchResult. This is the
+ * same for all WARC record types:
+ *
+ * file, offset, timestamp, digest, urlKey, originalUrl
+ */
+ private CaptureSearchResult genericResult(WARCRecord rec) {
+
CaptureSearchResult result = new CaptureSearchResult();
- result.setUrlKey(DEFAULT_VALUE);
- result.setOriginalUrl(DEFAULT_VALUE);
- result.setCaptureTimestamp(DEFAULT_VALUE);
- result.setDigest(DEFAULT_VALUE);
result.setMimeType(DEFAULT_VALUE);
result.setHttpCode(DEFAULT_VALUE);
result.setRedirectUrl(DEFAULT_VALUE);
- result.setFile(DEFAULT_VALUE);
- result.setOffset(0);
- return result;
- }
-
- private void addUrlDataToSearchResult(CaptureSearchResult result, String urlStr)
- throws IOException {
- result.setOriginalUrl(urlStr);
- String urlKey = canonicalizer.urlStringToKey(urlStr);
- result.setUrlKey(urlKey);
- }
+ ArchiveRecordHeader header = rec.getHeader();
- private CaptureSearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec)
- throws IOException {
-
- CaptureSearchResult result = getBlankSearchResult();
-
- result.setCaptureTimestamp(transformDate(header.getDate()));
- result.setFile(transformWarcFilename(header.getReaderIdentifier()));
- result.setOffset(header.getOffset());
+ String file = transformWARCFilename(header.getReaderIdentifier());
+ long offset = header.getOffset();
- String uriStr = header.getUrl();
-
- result.setMimeType(header.getMimetype());
-
- result.setOriginalUrl(uriStr);
- result.setUrlKey(uriStr);
-
- rec.close();
- result.setDigest(rec.getDigestStr());
-
- return result;
- }
-
- private CaptureSearchResult adaptGeneric(ArchiveRecordHeader header,
- WARCRecord rec, String mime)
- throws IOException {
-
- CaptureSearchResult result = getBlankSearchResult();
-
- result.setCaptureTimestamp(transformDate(header.getDate()));
- result.setFile(transformWarcFilename(header.getReaderIdentifier()));
- result.setOffset(header.getOffset());
- result.setDigest(transformDigest(header.getHeaderValue(
+ result.setCaptureTimestamp(transformWARCDate(header.getDate()));
+ result.setFile(file);
+ result.setOffset(offset);
+ result.setDigest(transformWARCDigest(header.getHeaderValue(
WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
- addUrlDataToSearchResult(result,header.getUrl());
-
- result.setMimeType(mime);
-
+ String origUrl = header.getUrl();
+ result.setOriginalUrl(origUrl);
+ try {
+ String urlKey = canonicalizer.urlStringToKey(origUrl);
+ result.setUrlKey(urlKey);
+ } catch (URIException e) {
+ LOGGER.warn("FAILED canonicalize(" + origUrl + "):" +
+ file + " " + offset);
+ result.setUrlKey(origUrl);
+ }
return result;
}
@@ -200,19 +167,55 @@
}
return count;
}
-
- private CaptureSearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec)
- throws IOException {
- CaptureSearchResult result = getBlankSearchResult();
+	/*
+	 * Reduce a reader identifier to its final path component:
+	 *   /some/dir/foo.warc.gz => foo.warc.gz
+	 * An identifier with no File.separator, or one that ends with the
+	 * separator, is returned unchanged.
+	 */
+	private String transformWARCFilename(String readerIdentifier) {
+		int index = readerIdentifier.lastIndexOf(File.separator);
+		// ">= 0" rather than "> 0": a separator in the very first position
+		// (e.g. "/foo.warc.gz") must be stripped as well
+		if (index >= 0 && (index + 1) < readerIdentifier.length()) {
+			return readerIdentifier.substring(index + 1);
+		}
+		return readerIdentifier;
+	}
- result.setCaptureTimestamp(transformDate(header.getDate()));
- result.setFile(transformWarcFilename(header.getReaderIdentifier()));
- result.setOffset(header.getOffset());
+	/*
+	 * Turn a WARC digest header value into its index form: a missing (null)
+	 * value becomes DEFAULT_VALUE, and a leading "sha1:" label is dropped:
+	 *   sha1:ABCDEF => ABCDEF
+	 * Any other value is returned as its toString() form.
+	 */
+	private String transformWARCDigest(final Object o) {
+		final String prefix = "sha1:";
+		if(o != null) {
+			String digest = o.toString();
+			return digest.startsWith(prefix)
+					? digest.substring(prefix.length())
+					: digest;
+		}
+		return DEFAULT_VALUE;
+	}
+
+ /*
+ * Transform input date to 14-digit timestamp:
+ * 2007-08-29T18:00:26Z => 20070829180026
+ */
+ private static String transformWARCDate(final String input) {
- String origUrl = header.getUrl();
- addUrlDataToSearchResult(result,origUrl);
+ StringBuilder output = new StringBuilder(14);
+
+ output.append(input.substring(0,4));
+ output.append(input.substring(5,7));
+ output.append(input.substring(8,10));
+ output.append(input.substring(11,13));
+ output.append(input.substring(14,16));
+ output.append(input.substring(17,19));
+
+ return output.toString();
+ }
+ /*
+ * Currently the WARCReader doesn't parse HTTP headers. This method parses
+ * them then calls the common ARC/WARC shared record parsing code, which
+ * addresses HTTP headers, and possibly even parses HTML content to look
+ * for Robot Meta tags.
+ */
+ private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result,
+ WARCRecord rec) throws IOException {
+
+ ArchiveRecordHeader header = rec.getHeader();
// need to parse the documents HTTP message and headers here: WARCReader
// does not implement this... yet..
@@ -234,66 +237,13 @@
Header[] headers = HttpParser.parseHeaders(rec,
ARCConstants.DEFAULT_ENCODING);
- rec.close();
- result.setDigest(transformDigest(header.getHeaderValue(
- WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
-
- if (headers != null) {
-
- for (Header httpHeader : headers) {
- if (httpHeader.getName().equals(
- WaybackConstants.LOCATION_HTTP_HEADER)) {
-
- String locationStr = httpHeader.getValue();
- // TODO: "Location" is supposed to be absolute:
- // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
- // (section 14.30) but Content-Location can be
- // relative.
- // is it correct to resolve a relative Location, as
- // we are?
- // it's also possible to have both in the HTTP
- // headers...
- // should we prefer one over the other?
- // right now, we're ignoring "Content-Location"
- result.setRedirectUrl(
- UrlOperations.resolveUrl(origUrl, locationStr));
- } else if(httpHeader.getName().toLowerCase().equals("content-type")) {
- result.setMimeType(transformHTTPMime(httpHeader.getValue()));
- }
- }
- }
- return result;
- }
-
- private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {
- CaptureSearchResult result = null;
- ArchiveRecordHeader header = rec.getHeader();
- String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
- if(type.equals(WARCConstants.RESPONSE)) {
- String mime = header.getMimetype();
- if(mime.equals("text/dns")) {
- result = adaptDNS(header,rec);
- } else {
- result = adaptResponse(header,rec);
- }
- } else if(type.equals(WARCConstants.REVISIT)) {
- result = adaptGeneric(header,rec,"warc/revisit");
- } else if(type.equals(WARCConstants.REQUEST)) {
- if(processAll) {
- result = adaptGeneric(header,rec,"warc/request");
- }
- } else if(type.equals(WARCConstants.METADATA)) {
- if(processAll) {
- result = adaptGeneric(header,rec,"warc/metadata");
- }
- } else {
- LOGGER.info("Skipping record type : " + type);
- }
+ annotater.annotateHTTPContent(result,rec,headers,header.getMimetype());
return result;
}
+
public UrlCanonicalizer getCanonicalizer() {
return canonicalizer;
}
@@ -301,4 +251,25 @@
public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
this.canonicalizer = canonicalizer;
}
+
+ /**
+ * @return true if request and metadata records are also adapted into
+ * results, false if those record types produce no result
+ */
+ public boolean isProcessAll() {
+ return processAll;
+ }
+
+ /**
+ * @param processAll true to also produce results for request and
+ * metadata records, false to return null for those record types
+ */
+ public void setProcessAll(boolean processAll) {
+ this.processAll = processAll;
+ }
+ /**
+ * @return the annotater used to transform the mime type of response
+ * records and to annotate results for HTTP responses
+ */
+ public HTTPRecordAnnotater getAnnotater() {
+ return annotater;
+ }
+
+ /**
+ * @param annotater the annotater to use for transforming the mime type
+ * of response records and for annotating results for HTTP responses
+ */
+ public void setAnnotater(HTTPRecordAnnotater annotater) {
+ this.annotater = annotater;
+ }
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2009-12-22 05:16:06
|
Revision: 2941
http://archive-access.svn.sourceforge.net/archive-access/?rev=2941&view=rev
Author: bradtofel
Date: 2009-12-22 05:15:56 +0000 (Tue, 22 Dec 2009)
Log Message:
-----------
Sending File not String to ArchiveReaderFactory.get() methods
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-12-18 00:34:47 UTC (rev 2940)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-12-22 05:15:56 UTC (rev 2941)
@@ -69,7 +69,12 @@
*/
public CloseableIterator<CaptureSearchResult> iterator(String pathOrUrl)
throws IOException {
- return iterator(ARCReaderFactory.get(pathOrUrl));
+ File f = new File(pathOrUrl);
+ if(f.isFile()) {
+ return iterator(ARCReaderFactory.get(f));
+ } else {
+ return iterator(ARCReaderFactory.get(pathOrUrl));
+ }
}
/**
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-12-18 00:34:47 UTC (rev 2940)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-12-22 05:15:56 UTC (rev 2941)
@@ -71,7 +71,12 @@
*/
public CloseableIterator<CaptureSearchResult> iterator(String pathOrUrl)
throws IOException {
- return iterator(WARCReaderFactory.get(pathOrUrl));
+ File f = new File(pathOrUrl);
+ if(f.isFile()) {
+ return iterator(WARCReaderFactory.get(f));
+ } else {
+ return iterator(WARCReaderFactory.get(pathOrUrl));
+ }
}
/**
* @param arc
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|
|
From: <bra...@us...> - 2011-02-06 14:36:52
|
Revision: 3394
http://archive-access.svn.sourceforge.net/archive-access/?rev=3394&view=rev
Author: bradtofel
Date: 2011-02-06 14:36:45 +0000 (Sun, 06 Feb 2011)
Log Message:
-----------
Now include version info in filedesc and warcinfo records
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2011-02-06 14:35:50 UTC (rev 3393)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java 2011-02-06 14:36:45 UTC (rev 3394)
@@ -42,7 +42,8 @@
// private static final Logger LOGGER = Logger.getLogger(
// ARCRecordToSearchResultAdapter.class.getName());
-
+ private static final String VERSION = "0.1.0";
+ private static final String ARC_FILEDESC_VERSION = "arc/filedesc" + VERSION;
private HTTPRecordAnnotater annotater = null;
private UrlCanonicalizer canonicalizer = null;
@@ -88,10 +89,8 @@
if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
- // skip filedesc record altogether...
- return null;
- }
- if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
+ result.setMimeType(ARC_FILEDESC_VERSION);
+ } else if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
// skip URL + HTTP header processing for dns records...
result.setUrlKey(uriStr);
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2011-02-06 14:35:50 UTC (rev 3393)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2011-02-06 14:36:45 UTC (rev 3394)
@@ -54,6 +54,10 @@
private static final Logger LOGGER =
Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName());
+
+ private static final String VERSION = "0.1.0";
+ private static final String WARC_FILEDESC_VERSION =
+ "warc/warcinfo" + VERSION;
private final static String DEFAULT_VALUE = "-";
private UrlCanonicalizer canonicalizer = null;
@@ -126,7 +130,7 @@
}
} else if(type.equals(WARCConstants.WARCINFO)) {
- result.setMimeType("warc/warcinfo");
+ result.setMimeType(WARC_FILEDESC_VERSION);
} else {
LOGGER.info("Skipping record type : " + type);
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|