From: Brad <bra...@us...> - 2005-10-18 02:31:01
|
Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/arcindexer
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3483/src/java/org/archive/wayback/arcindexer

Added Files:
    IndexPipeline.java ArcIndexer.java
Log Message:
Initial check-in -- pre code review

--- NEW FILE: ArcIndexer.java ---
package org.archive.wayback.arcindexer;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.text.ParseException;
import java.util.Iterator;

import org.archive.io.arc.ARCReader;
import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.arc.ARCRecord;
import org.archive.io.arc.ARCRecordMetaData;
import org.archive.net.UURI;
import org.archive.wayback.core.ResourceResult;
import org.archive.wayback.core.ResourceResults;
import org.archive.wayback.core.Timestamp;
import org.apache.commons.httpclient.Header;

/**
 * Converts the records of an ARC file into {@link ResourceResult} entries and
 * writes a collection of such entries to a CDX file, one entry per line.
 */
public class ArcIndexer {

    /** Name of the HTTP response header that carries a redirect target. */
    private final static String LOCATION_HTTP_HEADER = "Location";

    /**
     * Charset used when writing CDX lines, so that the output does not depend
     * on the platform default encoding.
     */
    private final static String CDX_CHARSET = "UTF-8";

    /** Placeholder written for fields that have no value. */
    private final static String EMPTY_FIELD = "-";

    public ArcIndexer() {
        super();
    }

    /**
     * Reads every record of the ARC file at {@code arcPath} and collects one
     * {@link ResourceResult} per record that could be converted.
     *
     * Records whose conversion throws a {@link NullPointerException} or a
     * {@link ParseException} are reported on stderr and skipped, so a single
     * bad record does not abort the whole file.
     *
     * @param arcPath path of the ARC file to index
     * @return the results for all records that were converted successfully
     * @throws IOException if the ARC file cannot be opened or read
     */
    public ResourceResults indexArc(final String arcPath) throws IOException {
        ResourceResults results = new ResourceResults();
        File arc = new File(arcPath);
        ARCReader arcReader = ARCReaderFactory.get(arc);
        try {
            arcReader.setParseHttpHeaders(true);
            // arcReader.createCDXIndexFile(arcPath) does not generate quite
            // the columns we need, so each record is converted by hand.
            Iterator itr = arcReader.iterator();
            while (itr.hasNext()) {
                ARCRecord rec = (ARCRecord) itr.next();
                ResourceResult result;
                try {
                    result = arcRecordToResourceResult(rec, arc);
                } catch (NullPointerException e) {
                    // Best effort: report and skip records with missing data.
                    e.printStackTrace();
                    continue;
                } catch (ParseException e) {
                    // Best effort: report and skip records that fail to parse.
                    e.printStackTrace();
                    continue;
                }
                results.addResourceResult(result);
            }
        } finally {
            // Release the underlying file handle even when indexing fails.
            arcReader.close();
        }
        return results;
    }

    /**
     * Builds a {@link ResourceResult} from the metadata and HTTP headers of a
     * single ARC record.
     *
     * @param rec the record to convert; it is closed by this method
     * @param arc the ARC file the record came from; only its name is recorded
     * @return the populated result
     * @throws NullPointerException if a required piece of metadata is missing
     * @throws IOException if the record cannot be closed or its URL is invalid
     * @throws ParseException if the record date cannot be parsed
     */
    private ResourceResult arcRecordToResourceResult(final ARCRecord rec,
            File arc) throws NullPointerException, IOException,
            ParseException {
        // The record is closed before its metadata is read.
        // NOTE(review): presumably this forces the record to be read to the
        // end so that the digest is available -- confirm against ARCRecord.
        rec.close();
        ARCRecordMetaData meta = rec.getMetaData();

        ResourceResult result = new ResourceResult();
        result.setArcFileName(arc.getName());
        result.setCompressedOffset(meta.getOffset());

        String statusCode = meta.getStatusCode();
        if (statusCode == null) {
            statusCode = EMPTY_FIELD;
        }
        result.setHttpResponseCode(statusCode);
        result.setMd5Fragment(meta.getDigest());
        result.setMimeType(meta.getMimetype());

        // One parsed URI serves both the original host and the index URL.
        UURI uri = new UURI(meta.getUrl(), false);
        result.setOrigHost(uri.getHost());

        result.setRedirectUrl(findRedirectUrl(rec.getHttpHeaders()));
        result.setTimeStamp(Timestamp.parseBefore(meta.getDate()));

        String searchHost = uri.getHostBasename();
        String searchPath = uri.getEscapedPathQuery();
        result.setUrl(searchHost + searchPath);
        return result;
    }

    /**
     * Returns the value of the first {@code Location} header, or {@code "-"}
     * when there is none.
     *
     * @param headers the HTTP headers of a record, may be {@code null}
     * @return the redirect target or the empty-field placeholder
     */
    private static String findRedirectUrl(Header[] headers) {
        if (headers != null) {
            for (int i = 0; i < headers.length; i++) {
                // HTTP header names are case-insensitive.
                if (LOCATION_HTTP_HEADER.equalsIgnoreCase(
                        headers[i].getName())) {
                    return headers[i].getValue();
                }
            }
        }
        return EMPTY_FIELD;
    }

    /**
     * Writes the CDX header line followed by one line per result to the file
     * at {@code cdxPath}, replacing any existing file.
     *
     * @param results the results to write
     * @param cdxPath path of the CDX file to create
     * @throws IOException if the file cannot be created or written
     */
    public void serializeResults(final ResourceResults results,
            final String cdxPath) throws IOException {
        Iterator itr = results.iterator();
        File cdx = new File(cdxPath);
        BufferedOutputStream output = new BufferedOutputStream(
                new FileOutputStream(cdx));
        try {
            output.write((ResourceResult.getCDXHeaderString() + "\n")
                    .getBytes(CDX_CHARSET));
            while (itr.hasNext()) {
                ResourceResult result = (ResourceResult) itr.next();
                output.write((result.toString() + "\n")
                        .getBytes(CDX_CHARSET));
            }
        } finally {
            // Flushes buffered data and releases the file so that callers can
            // rename or delete it afterwards.
            output.close();
        }
    }

    /**
     * Indexes one ARC file into one CDX file.
     *
     * @param args {@code args[0]} is the ARC file to read, {@code args[1]} the
     *            CDX file to write
     */
    public static void main(String[] args) {
        if (args.length < 2) {
            System.err.println("Usage: ArcIndexer <arc-file> <cdx-file>");
            System.exit(1);
        }
        ArcIndexer indexer = new ArcIndexer();
        String arc = args[0];
        String cdx = args[1];
        try {
            ResourceResults results = indexer.indexArc(arc);
            indexer.serializeResults(results, cdx);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

--- NEW FILE: IndexPipeline.java ---
package org.archive.wayback.arcindexer;

import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.archive.wayback.core.ResourceResults;

/**
 * Directory-based pipeline that discovers ARC files, indexes each of them with
 * an {@link ArcIndexer} and places the resulting CDX file in a merge
 * directory.
 *
 * State is kept as empty flag files named after the ARC file: one in the
 * "queued" directory for every ARC that has been seen, and one in the
 * "to-be-indexed" directory for every ARC that still has to be indexed.
 * {@link #init(String, String, String)} must be called before any other
 * method, because it assigns all directory fields and the indexer.
 */
public class IndexPipeline {

    /** Directory that is scanned for ARC files. */
    private File arcDir = null;

    /** Directory that receives the finished index files. */
    public File mergeDir = null;

    /** Holds one flag file per ARC that has already been queued. */
    private File queuedDir = null;

    /** Holds one flag file per ARC that is waiting to be indexed. */
    private File toBeIndexedDir = null;

    /** Scratch directory in which index files are written before the move. */
    private File indexingDir = null;

    private ArcIndexer indexer = null;

    public IndexPipeline() {
        super();
    }

    /**
     * Makes sure {@code dir} exists as a directory, creating it and any
     * missing parents if necessary.
     *
     * @param dir the directory to create
     * @throws IOException if the directory does not exist and cannot be created
     */
    private void ensureDir(File dir) throws IOException {
        // The final isDirectory() check tolerates another process creating
        // the directory between the first check and mkdirs().
        if (!dir.isDirectory() && !dir.mkdirs() && !dir.isDirectory()) {
            throw new IOException("FAILED to create " + dir.getAbsolutePath());
        }
    }

    /**
     * Configures the pipeline directories and creates the work directory with
     * its "queued", "to-be-indexed" and "indexing" subdirectories.
     *
     * @param arcDir directory containing the ARC files
     * @param mergeDir directory that receives finished index files
     * @param workDir directory under which the pipeline keeps its state
     * @throws IOException if a work directory cannot be created
     */
    public void init(final String arcDir, final String mergeDir,
            final String workDir) throws IOException {
        this.arcDir = new File(arcDir);
        this.mergeDir = new File(mergeDir);
        this.queuedDir = new File(workDir + "/queued");
        this.toBeIndexedDir = new File(workDir + "/to-be-indexed");
        this.indexingDir = new File(workDir + "/indexing");
        ensureDir(new File(workDir));
        ensureDir(this.queuedDir);
        ensureDir(this.toBeIndexedDir);
        ensureDir(this.indexingDir);
        indexer = new ArcIndexer();
    }

    /**
     * Lists the entry names of {@code dir}.
     *
     * @param dir the directory to list
     * @return the names of the entries in the directory, never {@code null}
     * @throws IOException if the directory cannot be listed
     */
    private static String[] listNames(File dir) throws IOException {
        String[] names = dir.list();
        if (names == null) {
            // File.list() signals "not a directory" and I/O errors with null.
            throw new IOException("Unable to list directory "
                    + dir.getAbsolutePath());
        }
        return names;
    }

    /**
     * Collects the entry names of {@code dir} into a set.
     *
     * @param dir the directory to list
     * @return a set containing the name of every entry in the directory
     * @throws IOException if the directory cannot be listed
     */
    private Set dirToSet(File dir) throws IOException {
        String[] entries = listNames(dir);
        Set names = new HashSet();
        for (int i = 0; i < entries.length; i++) {
            names.add(entries[i]);
        }
        return names;
    }

    /**
     * @return the names of all ARC files that have already been queued
     * @throws IOException if the queued directory cannot be listed
     */
    private Set getQueuedFiles() throws IOException {
        return dirToSet(this.queuedDir);
    }

    /**
     * Finds the entries of the ARC directory that have no flag file in the
     * queued directory yet.
     *
     * @return the names of the not yet queued entries, as {@code String}s
     * @throws IOException if a directory cannot be listed
     */
    private ArrayList getNewArcs() throws IOException {
        Set queued = getQueuedFiles();
        ArrayList newArcs = new ArrayList();
        String[] arcs = listNames(this.arcDir);
        for (int i = 0; i < arcs.length; i++) {
            if (!queued.contains(arcs[i])) {
                newArcs.add(arcs[i]);
            }
        }
        return newArcs;
    }

    /**
     * Creates the flag files that mark {@code newArc} as waiting to be indexed
     * and as queued.
     *
     * @param newArc name of the ARC file to queue
     * @throws IOException if a flag file cannot be created
     */
    private void queueArc(final String newArc) throws IOException {
        File newQueuedFile = new File(this.queuedDir, newArc);
        File newToBeIndexedFile = new File(this.toBeIndexedDir, newArc);
        // The to-be-indexed flag is written first: if the second create
        // fails, the ARC is still not marked as queued and is picked up again
        // on the next run.
        newToBeIndexedFile.createNewFile();
        newQueuedFile.createNewFile();
    }

    /**
     * Queues every ARC file that has not been queued before.
     *
     * @throws IOException if a directory cannot be listed or a flag file
     *             cannot be created
     */
    public void queueNewArcs() throws IOException {
        Iterator itr = getNewArcs().iterator();
        while (itr.hasNext()) {
            queueArc((String) itr.next());
        }
    }

    /**
     * Queues new ARC files and then indexes every ARC that has a flag file in
     * the to-be-indexed directory.
     *
     * For each ARC the index is written into the indexing directory, moved
     * into the merge directory under the ARC's name, and finally the
     * to-be-indexed flag is deleted. Processing stops at the first failure.
     *
     * @throws MalformedURLException declared for callers; not thrown directly
     *             by this method
     * @throws IOException if a directory cannot be listed, an ARC cannot be
     *             indexed, the index cannot be moved, or the flag file cannot
     *             be deleted
     */
    public void indexArcs() throws MalformedURLException, IOException {
        queueNewArcs();
        String[] toBeIndexed = listNames(this.toBeIndexedDir);
        for (int i = 0; i < toBeIndexed.length; i++) {
            String base = toBeIndexed[i];
            File arcFile = new File(this.arcDir, base);
            File tmpFile = new File(this.indexingDir, base);
            File flagFile = new File(this.toBeIndexedDir, base);
            File finalFile = new File(this.mergeDir, base);

            ResourceResults res = indexer.indexArc(arcFile.getAbsolutePath());
            indexer.serializeResults(res, tmpFile.getAbsolutePath());

            // The index only appears in the merge directory once it has been
            // written completely.
            if (!tmpFile.renameTo(finalFile)) {
                throw new IOException("Unable to move "
                        + tmpFile.getAbsolutePath() + " to "
                        + finalFile.getAbsolutePath());
            }
            if (!flagFile.delete()) {
                throw new IOException("Unable to delete "
                        + flagFile.getAbsolutePath());
            }
        }
    }

    /**
     * @param args unused
     */
    public static void main(String[] args) {
        // TODO Auto-generated method stub
    }
}
|