From: <bra...@us...> - 2008-06-24 22:55:27
|
Revision: 2305 http://archive-access.svn.sourceforge.net/archive-access/?rev=2305&view=rev Author: bradtofel Date: 2008-06-24 15:55:35 -0700 (Tue, 24 Jun 2008) Log Message: ----------- INITIAL REV: ResourceFile abstraction, including ResourceFileSource interface, which will allow recursive local directories, polling of local and remote HTTP exported directories Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileLocation.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ArcWarcFilenameFilter.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,50 @@ +/* ArcWarcFilenameFilter + * + * $Id$ + * + * Created on 4:15:56 PM May 29, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.File; +import java.io.FilenameFilter; + +/** + * FilenameFilter which returns only compressed/uncompressed ARC/WARC files. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ArcWarcFilenameFilter implements FilenameFilter { + private final static String ARC_SUFFIX = ".arc"; + private final static String ARC_GZ_SUFFIX = ".arc.gz"; + private final static String WARC_SUFFIX = ".warc"; + private final static String WARC_GZ_SUFFIX = ".warc.gz"; + + public boolean accept(File dir, String name) { + return name.endsWith(ARC_SUFFIX) || + name.endsWith(ARC_GZ_SUFFIX) || + name.endsWith(WARC_SUFFIX) || + name.endsWith(WARC_GZ_SUFFIX); + } + +} + Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/DirectoryResourceFileSource.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,144 @@ +/* DirectoryResourceFileSource + * + * $Id$ + * + * Created on 4:00:49 PM May 29, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.File; +import java.io.FilenameFilter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Local directory tree holding ARC and WARC files. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class DirectoryResourceFileSource implements ResourceFileSource { + + private static char SEPRTR = '_'; + private String name = null; + private String path = null; + private File root = null; + private FilenameFilter filter = new ArcWarcFilenameFilter(); + private boolean recurse = true; + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getFileList() + */ + public ResourceFileList getResourceFileList() throws IOException { + if(root == null) { + throw new IOException("No prefix set"); + } + ResourceFileList list = new ResourceFileList(); + populateFileList(list,root,recurse); + return list; + } + + /** + * add all files matching this.filter beneath root to list, recursing if + * recurse is set. + * + * @param list + * @param root + * @param recurse + * @throws IOException + */ + private void populateFileList(ResourceFileList list, File root, boolean recurse) + throws IOException { + + File[] files = root.listFiles(); + for(File file : files) { + if(file.isFile() && filter.accept(root, file.getName())) { + ResourceFileLocation location = new ResourceFileLocation( + file.getName(),file.getAbsolutePath()); + list.add(location); + } else if(recurse && file.isDirectory()){ + populateFileList(list, file, recurse); + } + } + } + + public String getBasename(String path) { + int sepIdx = path.lastIndexOf(File.separatorChar); + if(sepIdx != -1) { + return path.substring(sepIdx + 1); + } + return path; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getName() + */ + public String getName() { + if(name != null) { + return name; + } + if(root != null) { + return root.getAbsolutePath().replace(File.separatorChar, SEPRTR); + } + return null; + } + + public void setName(String name) { + this.name = name; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getPrefix() + */ + public String getPrefix() { + return path; + } + public void setPrefix(String path) { + this.path = path; + root = new File(path); + } + + public boolean isRecurse() { + return recurse; + } + + public void setRecurse(boolean recurse) { + this.recurse = recurse; + } + + public FilenameFilter getFilter() { + return filter; + } + + public void setFilter(FilenameFilter filter) { + this.filter = filter; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getSources() + */ + public List<ResourceFileSource> getSources() { + List<ResourceFileSource> sources = new ArrayList<ResourceFileSource>(); + sources.add(this); + return sources; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/JspUrlResourceFileSource.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,116 @@ +/* JspUrlResourceFileSource + * + * $Id$ + * + * Created on 5:05:53 PM Jun 5, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class JspUrlResourceFileSource implements ResourceFileSource { + + private final static char WEB_SEPARATOR_CHAR = '/'; + private final static String LINE_SEPARATOR_STRING = "\n"; + private String name = null; + private String prefix = null; + private String jsp = null; + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getBasename(java.lang.String) + */ + public String getBasename(String path) { + int sepIdx = path.lastIndexOf(WEB_SEPARATOR_CHAR); + if(sepIdx != -1) { + return path.substring(sepIdx + 1); + } + return path; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getFileList() + */ + public ResourceFileList getResourceFileList() throws IOException { + + String url = "http://localhost:8080" + jsp + "?url=" + prefix; + URL u = new URL(url); + InputStream is = u.openStream(); + InputStreamReader isr = new InputStreamReader(is); + StringBuilder sb = new StringBuilder(2000); + int READ_SIZE = 2048; + char cbuf[] = new char[READ_SIZE]; + int amt = 0; + while((amt = isr.read(cbuf, 0, READ_SIZE)) != -1) { + sb.append(new String(cbuf,0,amt)); + } + ResourceFileList list = new ResourceFileList(); + String lines[] = sb.toString().split(LINE_SEPARATOR_STRING); + for(String line : lines) { + ResourceFileLocation location = + ResourceFileLocation.deserializeLine(line); + if(location != null) { + list.add(location); + } else { + throw new IOException("Bad line format(" + line +")"); + } + } + return list; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getName() + */ + public String getName() { + return name; + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourcestore.resourcefile.ResourceFileSource#getPrefix() + */ + public String getPrefix() { + return prefix; + } + + public void setName(String name) { + this.name = name; + } + + public void setPrefix(String prefix) { + this.prefix = prefix; + } + + public String getJsp() { + return jsp; + } + + public void setJsp(String jsp) { + this.jsp = jsp; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileList.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,119 @@ +/* ResourceFileList + * + * $Id$ + * + * Created on 12:15:53 PM Jun 16, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.logging.Logger; + +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ResourceFileList { + private static final Logger LOGGER = + Logger.getLogger(ResourceFileList.class.getName()); + + private HashMap<String,ResourceFileLocation> files = + new HashMap<String,ResourceFileLocation>(); + public void add(ResourceFileLocation location) { + files.put(location.serializeLine(), location); + } + public void addAll(Iterator<ResourceFileLocation> itr) { + while(itr.hasNext()) { + add(itr.next()); + } + } + + public Iterator<ResourceFileLocation> iterator() { + return files.values().iterator(); + } + + public void store(File target) throws IOException { + FlatFile ff = new FlatFile(target.getAbsolutePath()); + Iterator<String> adapted = + new AdaptedIterator<ResourceFileLocation,String>(iterator(), + new ResourceFileLocationAdapter()); + ff.store(adapted); + } + + public static ResourceFileList load(File source) throws IOException { + ResourceFileList list = new ResourceFileList(); + + FlatFile ff = new FlatFile(source.getAbsolutePath()); + CloseableIterator<String> itr = ff.getSequentialIterator(); + while(itr.hasNext()) { + String line = itr.next(); + ResourceFileLocation location = + ResourceFileLocation.deserializeLine(line); + if(location != null) { + list.add(location); + } else { + LOGGER.warning("Bad parse of line(" + line + ") in (" + + source.getAbsolutePath() + ")"); + } + } + itr.close(); + return list; + } + + public ResourceFileList subtract(ResourceFileList that) { + HashMap<String,ResourceFileLocation> tmp = + new HashMap<String,ResourceFileLocation>(); + Iterator<ResourceFileLocation> thisItr = iterator(); + while(thisItr.hasNext()) { + ResourceFileLocation location = thisItr.next(); + tmp.put(location.serializeLine(), location); + } + + Iterator<ResourceFileLocation> thatItr = that.iterator(); + while(thatItr.hasNext()) { + ResourceFileLocation location = thatItr.next(); + tmp.remove(location.serializeLine()); + } + ResourceFileList sub = new ResourceFileList(); + sub.addAll(tmp.values().iterator()); + return sub; + } + + private class ResourceFileLocationAdapter implements Adapter<ResourceFileLocation,String> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public String adapt(ResourceFileLocation o) { + return o.serializeLine(); + } + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileLocation.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileLocation.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileLocation.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,80 @@ +/* ResourceFileLocation + * + * $Id$ + * + * Created on 12:16:04 PM Jun 16, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +/** + * Class encapsulating the name and String location(url/path) of a ResourceFile. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ResourceFileLocation { + private final static char DELIMETER = '\t'; + private String name = null; + private String url = null; + public ResourceFileLocation(String name, String url) { + this.name = name; + this.url = url; + } + public String serializeLine() { + StringBuilder sb = new StringBuilder(100); + sb.append(name); + sb.append(DELIMETER); + sb.append(url); + return sb.toString(); + } + public static ResourceFileLocation deserializeLine(String line) { + int idx = line.indexOf(DELIMETER); + if(idx > -1) { + return new ResourceFileLocation(line.substring(0,idx), + line.substring(idx+1)); + } + return null; + } + /** + * @return the name + */ + public String getName() { + return name; + } + /** + * @param name the name to set + */ + public void setName(String name) { + this.name = name; + } + /** + * @return the url + */ + public String getUrl() { + return url; + } + /** + * @param url the url to set + */ + public void setUrl(String url) { + this.url = url; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSource.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,41 @@ +/* ResourceFileSource + * + * $Id$ + * + * Created on 3:49:17 PM May 29, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.IOException; + +/** + * Interface representing the abstract remote or local folder holding ARC/WARC + * files. + * + * @author brad + * @version $Date$, $Revision$ + */ +public interface ResourceFileSource { + public String getName(); + public String getPrefix(); + public String getBasename(String path); + public ResourceFileList getResourceFileList() throws IOException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/ResourceFileSourceUpdater.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,162 @@ +/* ResourceFileSourceUpdater + * + * $Id$ + * + * Created on 12:30:38 PM Jun 23, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; + +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.logging.Logger; + +import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDBUpdater; +import org.archive.wayback.util.DirMaker; + +/** + * Class which repeatedly builds a ResourceFileList for a set of + * ResourceFileSource objects, serializing them into files, and dropping them + * into the incoming directory of a ResourceFileLocationDBUpdater. + * + * In the current implementation, this uses only a single thread to scan the + * ResourceFileSource objects, but with larger installations (1000's of + * ResourceFileSources), multiple threads may later be required. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ResourceFileSourceUpdater { + private static final Logger LOGGER = + Logger.getLogger(ResourceFileSourceUpdater.class.getName()); + private List<ResourceFileSource> sources = null; + + private File target = null; + + + private UpdateThread thread = null; + private long interval = 120000; + + public void init() { + if(interval > 0) { + thread = new UpdateThread(this,interval); + thread.start(); + } + } + + public void shutdown() { + if(thread != null) { + thread.interrupt(); + try { + thread.join(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + private void synchronizeSource(ResourceFileSource source) { + String name = source.getName(); + try { + LOGGER.fine("Synchronizing " + name); + ResourceFileList list = source.getResourceFileList(); + String tmp = name + ResourceFileLocationDBUpdater.TMP_SUFFIX; + File tmpListTarget = new File(target,tmp); + File listTarget = new File(target,name); + list.store(tmpListTarget); + tmpListTarget.renameTo(listTarget); + LOGGER.fine("Synchronized " + name); + } catch (IOException e) { + e.printStackTrace(); + LOGGER.warning("FAILED Synchronize " + name + e.getMessage()); + } + } + + public void synchronizeSources() { + for(ResourceFileSource source : sources) { + synchronizeSource(source); + } + } + + private class UpdateThread extends Thread { + private long runInterval = 120000; + private ResourceFileSourceUpdater updater = null; + + public UpdateThread(ResourceFileSourceUpdater updater, + long runInterval) { + + this.updater = updater; + this.runInterval = runInterval; + } + + public void run() { + LOGGER.info("alive"); + while (true) { + try { + long startSync = System.currentTimeMillis(); + updater.synchronizeSources(); + long endSync = System.currentTimeMillis(); + long syncDuration = endSync - startSync; + long sleepInterval = runInterval - syncDuration; + if(sleepInterval > 0) { + sleep(sleepInterval); + } else { + LOGGER.warning("Last Synchronize took " + syncDuration + + " where interval is " + interval + + ". Not sleeping."); + } + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + } + + public List<ResourceFileSource> getSources() { + return sources; + } + + public void setSources(List<ResourceFileSource> sources) { + this.sources = sources; + } + + public String getTarget() { + return DirMaker.getAbsolutePath(target); + } + + public void setTarget(String target) throws IOException { + this.target = DirMaker.ensureDir(target); + } + + /** + * @return the interval + */ + public long getInterval() { + return interval; + } + + /** + * @param interval the interval to set + */ + public void setInterval(long interval) { + this.interval = interval; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/resourcefile/UrlLinkExtractor.java 2008-06-24 22:55:35 UTC (rev 2305) @@ -0,0 +1,105 @@ +/* UrlLinkExtractor + * + * $Id$ + * + * Created on 4:26:53 PM Jun 5, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourcestore.resourcefile; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class UrlLinkExtractor { + private final static String QUOTED_ATTR_VALUE = "(?:\"[^\">]*\")"; + + private final static String ESC_QUOTED_ATTR_VALUE = "(?:\\\\\"[^>\\\\]*\\\\\")"; + + private final static String APOSED_ATTR_VALUE = "(?:'[^'>]*')"; + + private final static String RAW_ATTR_VALUE = "(?:[^ \\t\\n\\x0B\\f\\r>\"']+)"; + + + private final static String ANY_ATTR_VALUE = QUOTED_ATTR_VALUE + "|" + + APOSED_ATTR_VALUE + "|" + ESC_QUOTED_ATTR_VALUE + "|" + + RAW_ATTR_VALUE; + + private final static String tagName = "a"; + private final static String attrName = "href"; + + private final static String tagPatString = "<\\s*" + tagName + + "\\s+[^>]*\\b" + attrName + + "\\s*=\\s*(" + ANY_ATTR_VALUE + ")(?:\\s|>)?"; + + private final static Pattern pc = Pattern.compile(tagPatString, + Pattern.CASE_INSENSITIVE); + + public static List<String> extractLinks(final String url) throws IOException { + URL u = new URL(url); + InputStream is = u.openStream(); + InputStreamReader isr = new InputStreamReader(is); + StringBuilder sb = new StringBuilder(2000); + int READ_SIZE = 2048; + char cbuf[] = new char[READ_SIZE]; + int amt = 0; + while((amt = isr.read(cbuf, 0, READ_SIZE)) != -1) { + sb.append(new String(cbuf,0,amt)); + } + return extractAnchors(sb); + } + + private static List<String> extractAnchors(final StringBuilder sb) { + + Matcher m = pc.matcher(sb); + + ArrayList<String> anchors = new ArrayList<String>(); + int idx = 0; + while(m.find(idx)) { + anchors.add(trimAttr(m.group(1))); + idx = m.end(1); + } + return anchors; + } + + private static String trimAttr(final String attr) { + int attrLength = attr.length(); + if (attr.charAt(0) == '"') { + return attr.substring(1, attrLength - 1); + } else if (attr.charAt(0) == '\'') { + return attr.substring(1, attrLength - 1); + } else if (attr.charAt(0) == '\\') { + return attr.substring(2, attrLength - 2); + } + return attr; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |