larm-cvs Mailing List for Lucene Advanced Retrieval Machine (LARM)
Brought to you by:
cmarschner,
otis
You can subscribe to this list here.
2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
(31) |
Jul
(25) |
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
---|---|---|---|---|---|---|---|---|---|---|---|---|
2011 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
(1) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: <ot...@us...> - 2003-07-29 16:01:29
|
Update of /cvsroot/larm/larm In directory sc8-pr-cvs1:/tmp/cvs-serv24630 Added Files: larm.bat larm.sh build.xml Log Message: - Initial checkin. --- NEW FILE: larm.bat --- java -classpath build/classes larm.root.LARM src/config/%1.xml --- NEW FILE: larm.sh --- java -classpath build/classes larm.root.LARM src/config/$1.xml --- NEW FILE: build.xml --- <project name="larm" default="compile" basedir="."> <property file="${basedir}/build.properties"/> <!-- Component local --> <property name="lib.dir" value="./lib"/> <!-- The current version number of this component --> <property name="app.title" value="LARM"/> <property name="app.version" value="V0.0"/> <!-- The base directory for component sources --> <property name="source.home" value="src"/> <!-- The base directory for unit test sources --> <property name="test.home" value="src/test"/> <!-- The base directory for compilation targets --> <property name="build.home" value="build"/> <!-- The base directory for distribution targets --> <property name="dist.home" value="dist"/> <!-- Should Java compilations set the 'debug' compiler option? --> <property name="compile.debug" value="true"/> <!-- Should Java compilations set the 'deprecation' compiler option? --> <property name="compile.deprecation" value="true"/> <!-- Should Java compilations set the 'optimize' compiler option? 
--> <property name="compile.optimize" value="true"/> <!-- Construct compile classpath --> <path id="base.classpath"> <pathelement location="${build.home}/classes"/> </path> <path id="compile.classpath"> <pathelement location="${build.home}/classes"/> <pathelement location="${junit.jar}"/> </path> <path id="test.classpath"> <pathelement location="${build.home}/classes"/> <pathelement location="${build.home}/tests"/> <pathelement location="${junit.jar}"/> <pathelement location="${xalan.jar}"/> </path> <!-- The root test to execute --> <property name="test.runner" value="junit.swingui.TestRunner"/> <property name="test.entry" value="larm.AllTests"/> <!-- ========== Targets: "Internal" Targets =============================== --> <target name="init" description="Initialize and evaluate conditionals"> <echo message="-------- ${app.title} ${app.version} --------"/> </target> <target name="prepare" depends="init" description="Prepare build directory"> <mkdir dir="${build.home}"/> <mkdir dir="${build.home}/classes"/> <mkdir dir="${build.home}/tests"/> </target> <!-- ========== Targets: "External" Targets =============================== --> <target name="dist" depends="compile,doc" description="Create binary distribution"> <mkdir dir="${dist.home}"/> <mkdir dir="${dist.home}"/> <jar jarfile ="${dist.home}/larm.jar" basedir ="${build.home}/classes" manifest ="${build.home}/conf/MANIFEST.MF"> <metainf dir="${dist.home}"> <include name="LICENSE.txt"/> </metainf> </jar> </target> <!-- ========== Targets: "External" Targets: Clean-up ===================== --> <target name="clean" description="Clean build and distribution directories"> <delete dir="${build.home}"/> <delete dir="${dist.home}"/> </target> <target name="all" depends="clean,compile" description="Clean and compile all components"/> <!-- ========== Targets: "External" Targets: Compilation ================== --> <target name="compile" depends="prepare" description="Compile shareable components"> <javac srcdir 
="${source.home}/java" destdir ="${build.home}/classes" debug ="${compile.debug}" source ="1.4" deprecation ="${compile.deprecation}" optimize ="${compile.optimize}"> <classpath refid="compile.classpath"/> </javac> </target> <target name="compile.tests" depends="compile" description="Compile unit test cases"> <javac srcdir ="${test.home}" destdir ="${build.home}/tests" debug ="${compile.debug}" source ="1.4" deprecation ="${compile.deprecation}" optimize ="${compile.optimize}"> <classpath refid="test.classpath"/> </javac> <copy todir="${build.home}/tests" filtering="on"> <fileset dir="${test.home}" excludes="**/*.java"/> </copy> </target> <!-- ========== Targets: "External" Targets: Testing ====================== --> <target name="test" depends="compile.tests" if="test.entry" description="Run all unit test cases"> <java classname="${test.runner}" fork="yes" failonerror="${test.failonerror}"> <jvmarg value="-Djava.protocol.handler.pkgs=${java.protocol.handler.pkgs}"/> <jvmarg value="-Dorg.apache.commons.logging.Log=${httpclient.test.log}"/> <jvmarg value="-Dhttpclient.test.webappContext=${httpclient.test.webappContext}" /> <arg value="${test.entry}"/> <classpath refid="test.classpath"/> </java> </target> <!-- ========== Targets: "External" Targets: Documenation ================= --> <target name="doc" depends="javadoc" description="Create component documentation."> <mkdir dir="${dist.home}"/> <mkdir dir="${dist.home}/docs"/> <copy todir="${dist.home}/docs" filtering="off"> <fileset dir="docs"/> </copy> </target> <target name="javadoc" depends="compile" description="Create component Javadoc documentation"> <mkdir dir="${dist.home}"/> <mkdir dir="${dist.home}/docs"/> <mkdir dir="${dist.home}/docs/api"/> <javadoc sourcepath ="${source.home}/java" destdir ="${dist.home}/docs/api" packagenames ="larm.*" author ="true" protected ="true" version ="true" source="1.4" doctitle ="<h1>${app.title}</h1>" windowtitle ="${app.title} (Version ${app.version})" > </javadoc> 
</target> </project> |
From: <ot...@us...> - 2003-07-29 15:15:13
|
Update of /cvsroot/larm/larm/src/java/larm/pipes In directory sc8-pr-cvs1:/tmp/cvs-serv24906/src/java/larm/pipes Modified Files: PipelineManager.java Log Message: - Small fix. Index: PipelineManager.java =================================================================== RCS file: /cvsroot/larm/larm/src/java/larm/pipes/PipelineManager.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** PipelineManager.java 29 Jul 2003 15:10:40 -0000 1.1 --- PipelineManager.java 29 Jul 2003 15:15:10 -0000 1.2 *************** *** 1,7 **** - /* - * Created on 30.06.2003 by Administrator - * - * $Id$ - */ package larm.pipes; --- 1,2 ---- *************** *** 21,26 **** * PipelineManager * ! * @author Administrator ! * 30.06.2003 */ public class PipelineManager implements Configurable, Startable --- 16,21 ---- * PipelineManager * ! * @author ! * @version $Id$ */ public class PipelineManager implements Configurable, Startable |
From: <ot...@us...> - 2003-07-29 15:11:44
|
Update of /cvsroot/larm/larm/src/test/larm In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/test/larm Modified Files: AllTests.java Log Message: - Big bad update, reorganization, etc. Index: AllTests.java =================================================================== RCS file: /cvsroot/larm/larm/src/test/larm/AllTests.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** AllTests.java 24 Jun 2003 17:19:02 -0000 1.1 --- AllTests.java 29 Jul 2003 15:11:41 -0000 1.2 *************** *** 1,35 **** ! /* ! * Created on 23.06.2003 ! * ! */ ! ! package larm; ! ! import junit.framework.Test; ! import junit.framework.TestSuite; ! import larm.config.ConfigurationTest; ! import larm.pipes.PipelineTest; ! ! /** ! * @author cmarschner ! * ! * contains all tests ! */ ! public class AllTests ! { ! ! public static void main(String[] args) ! { ! junit.swingui.TestRunner.run(AllTests.class); ! } ! ! public static Test suite() ! { ! TestSuite suite = new TestSuite("Test for larm"); ! //$JUnit-BEGIN$ ! suite.addTestSuite(ConfigurationTest.class); ! suite.addTestSuite(PipelineTest.class); ! //$JUnit-END$ ! return suite; ! } ! } --- 1,31 ---- ! package larm; ! ! import junit.framework.Test; ! import junit.framework.TestSuite; ! import larm.config.ConfigurationTest; ! import larm.pipes.PipelineTest; ! ! /** ! * Contains all tests. ! * ! * @author ! * @version $Id$ ! */ ! public class AllTests ! { ! ! public static void main(String[] args) ! { ! junit.swingui.TestRunner.run(AllTests.class); ! } ! ! public static Test suite() ! { ! TestSuite suite = new TestSuite("Test for larm"); ! //$JUnit-BEGIN$ ! suite.addTestSuite(ConfigurationTest.class); ! suite.addTestSuite(PipelineTest.class); ! //$JUnit-END$ ! return suite; ! } ! } |
From: <ot...@us...> - 2003-07-29 15:11:44
|
Update of /cvsroot/larm/larm/src/test/larm/config In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/test/larm/config Modified Files: ConfigurationTest.java Log Message: - Big bad update, reorganization, etc. Index: ConfigurationTest.java =================================================================== RCS file: /cvsroot/larm/larm/src/test/larm/config/ConfigurationTest.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ConfigurationTest.java 24 Jun 2003 17:19:32 -0000 1.1 --- ConfigurationTest.java 29 Jul 2003 15:11:41 -0000 1.2 *************** *** 1,95 **** ! /* ! * Created on 24.06.2003 by Administrator ! * ! * $Id$ ! */ ! package larm.config; ! ! import junit.framework.TestCase; ! ! /** ! * ConfigurationTest ! * ! * @author Administrator ! * 24.06.2003 ! */ ! public class ConfigurationTest extends TestCase ! { ! ! /** ! * Constructor for ConfigurationTest. ! * @param arg0 ! */ ! public ConfigurationTest(String arg0) ! { ! super(arg0); ! } ! ! public static void main(String[] args) ! { ! junit.swingui.TestRunner.run(ConfigurationTest.class); ! } ! ! /* ! * Test for String getPropertyAsString(String) ! */ ! public void testGetPropertyAsStringString() ! { ! } ! ! /* ! * Test for String getPropertyAsString(String, String) ! */ ! public void testGetPropertyAsStringStringString() ! { ! } ! ! /* ! * Test for long getPropertyAsLong(String) ! */ ! public void testGetPropertyAsLongString() ! { ! } ! ! /* ! * Test for long getPropertyAsLong(String, long) ! */ ! public void testGetPropertyAsLongStringlong() ! { ! } ! ! public void testGetPropertyAsDouble() ! { ! } ! ! public void testGetPropertyAsBoolean() ! { ! } ! ! public void testGetPropertyAsNrOfBytes() ! { ! } ! ! public void testGetSubConfig() ! { ! } ! ! public void testGetSubConfigList() ! { ! } ! ! /* ! * Test for void Configuration(Node) ! */ ! public void testConfigurationNode() ! { ! } ! ! /* ! * Test for void Configuration(Reader) ! */ ! public void testConfigurationReader() ! 
{ ! } ! ! } --- 1,124 ---- ! package larm.config; ! ! import java.io.Reader; ! import java.io.StringReader; ! ! import junit.framework.TestCase; ! import larm.framework.config.Configuration; ! ! /** ! * ConfigurationTest ! * ! * @author ! * @version $Id$ ! */ ! public class ConfigurationTest extends TestCase ! { ! String xml = null; ! Reader xmlReader = null; ! Configuration c = null; ! ! public void setUp() ! { ! try ! { ! xml = "<?xml version=\"1.0\"?>" + ! "<larm>" + ! " <sources>" + ! " <fileSource>" + ! " <fileset dir=\"c:/larm/test/*.lst\"/>" + ! " <pipeline>testPipe</pipeline>" + ! " </fileSource>" + ! " </sources>" + ! " <pipelines>" + ! " " + ! " </pipelines>" + ! "</larm>"; ! xmlReader = new StringReader(xml); ! c = new Configuration(xmlReader); ! } ! catch(Exception e) ! { ! TestCase.fail(e.toString()); ! } ! } ! ! /** ! * Constructor for ConfigurationTest. ! * @param arg0 ! */ ! public ConfigurationTest(String arg0) ! { ! super(arg0); ! } ! ! public static void main(String[] args) ! { ! junit.swingui.TestRunner.run(ConfigurationTest.class); ! } ! ! /* ! * Test for String getPropertyAsString(String) ! */ ! public void testGetPropertyAsStringString() ! { ! assertEquals("testPipe", c.getProperty("/larm/sources/fileSource/pipeline")); ! } ! ! /* ! * Test for String getPropertyAsString(String, String) ! */ ! public void testGetPropertyAsStringStringString() ! { ! assertEquals("testPipe", c.getProperty("/larm/sources/fileSource/pipeline", "def1")); ! assertEquals("def1", c.getProperty("/larm/sources/fileSource/pipeline111", "def1")); ! } ! ! /* ! * Test for long getPropertyAsLong(String) ! */ ! public void testGetPropertyAsLongString() ! { ! } ! ! /* ! * Test for long getPropertyAsLong(String, long) ! */ ! public void testGetPropertyAsLongStringlong() ! { ! } ! ! public void testGetPropertyAsDouble() ! { ! } ! ! public void testGetPropertyAsBoolean() ! { ! } ! ! public void testGetPropertyAsNrOfBytes() ! { ! } ! ! public void testGetSubConfig() ! { ! } ! ! 
public void testGetSubConfigList() ! { ! } ! ! /* ! * Test for void Configuration(Node) ! */ ! public void testConfigurationNode() ! { ! } ! ! /* ! * Test for void Configuration(Reader) ! */ ! public void testConfigurationReader() ! { ! } ! } |
From: <ot...@us...> - 2003-07-29 15:11:44
|
Update of /cvsroot/larm/larm/src/test/larm/pipes In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/test/larm/pipes Modified Files: PipelineTest.java Log Message: - Big bad update, reorganization, etc. Index: PipelineTest.java =================================================================== RCS file: /cvsroot/larm/larm/src/test/larm/pipes/PipelineTest.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** PipelineTest.java 24 Jun 2003 17:20:09 -0000 1.1 --- PipelineTest.java 29 Jul 2003 15:11:41 -0000 1.2 *************** *** 1,38 **** ! /* ! * Created on 23.06.2003 ! * ! * To change the template for this generated file go to ! * Window>Preferences>Java>Code Generation>Code and Comments ! */ ! package larm.pipes; ! ! import junit.framework.TestCase; ! ! /** ! * @author Administrator ! */ ! public class PipelineTest extends TestCase ! { ! ! ! ! /** ! * Constructor for PipelineTest. ! * @param arg0 ! */ ! public PipelineTest(String arg0) ! { ! super(arg0); ! } ! ! public void testSomething() ! { ! // TODO add code here ! } ! ! public static void main(String[] args) ! { ! junit.swingui.TestRunner.run(PipelineTest.class); ! } ! ! } --- 1,29 ---- ! package larm.pipes; ! ! import junit.framework.TestCase; ! ! /** ! * @author ! * @version $Id$ ! */ ! public class PipelineTest extends TestCase ! { ! /** ! * Constructor for PipelineTest. ! * @param arg0 ! */ ! public PipelineTest(String arg0) ! { ! super(arg0); ! } ! ! public void testSomething() ! { ! // TODO add code here ! } ! ! public static void main(String[] args) ! { ! junit.swingui.TestRunner.run(PipelineTest.class); ! } ! } |
From: <ot...@us...> - 2003-07-29 15:11:44
|
Update of /cvsroot/larm/larm/src/java/larm/sources In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/sources Added Files: SourceManager.java Log Message: - Big bad update, reorganization, etc. --- NEW FILE: SourceManager.java --- package larm.sources; import java.util.HashMap; import java.util.Iterator; import java.util.logging.Logger; import larm.framework.Context; import larm.framework.Contextualizable; import larm.framework.Lifecycle; import larm.framework.Startable; import larm.framework.config.ConfigList; import larm.framework.config.Configurable; import larm.framework.config.Configuration; import larm.framework.sources.Source; /** * SourceManager * * @author * @version $Id: SourceManager.java,v 1.1 2003/07/29 15:11:41 otis Exp $ */ public class SourceManager implements Configurable, Startable, Contextualizable { static Logger log = Logger.getLogger(SourceManager.class.getName()); HashMap sources = new HashMap(); public SourceManager() { } /* (non-Javadoc) * @see larm.config.Configurable#configure(larm.config.Configuration) */ public void configure(Configuration conf) { try { log.info("configuring SourceManager"); ConfigList list = conf.getSubConfigList("source"); for (int i = 0; i < list.length(); i++) { Configuration c = list.item(i); String type = c.getProperty("@type"); String name = c.getProperty("@name"); log.info("found source of type " + type + " with name " + name); Class clazz = Class.forName(type); Source p = (Source)clazz.newInstance(); Lifecycle.configure(p, c); sources.put(name, p); } if (list.length() == 0) { log.info("no pipelines to register"); } } catch(InstantiationException e) { } catch(ClassNotFoundException e) { } catch(IllegalAccessException e) { } } /* (non-Javadoc) * @see larm.framework.Startable#start() */ public void start() { for (Iterator it = sources.values().iterator(); it.hasNext();) { Lifecycle.start(it.next()); } } /** * @see larm.framework.Contextualizable#contextualize(larm.framework.Context) */ public void 
contextualize(Context ctx) { } } |
From: <ot...@us...> - 2003-07-29 15:11:43
|
Update of /cvsroot/larm/larm/src/java/larm/root In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/root Added Files: LARM.java Log Message: - Big bad update, reorganization, etc. --- NEW FILE: LARM.java --- package larm.root; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.logging.Level; import java.util.logging.Logger; import larm.config.PropertyManager; import larm.framework.Context; import larm.framework.Contextualizable; import larm.framework.Lifecycle; import larm.framework.Startable; import larm.framework.config.Configurable; import larm.framework.config.Configuration; import larm.pipes.PipelineManager; import larm.sources.SourceManager; import org.xml.sax.SAXException; /** * LARM * * @author * @version $Id: LARM.java,v 1.1 2003/07/29 15:11:41 otis Exp $ */ public final class LARM implements Configurable, Contextualizable, Startable { /** * reads the properties section and puts them to the system properties */ PropertyManager propertyManager = new PropertyManager(); /** * reads the pipes section and registers all pipelines */ PipelineManager pipelineManager = new PipelineManager(); /** * reads the sources section and initializes and starts all sources */ SourceManager sourceManager = new SourceManager(); /** * this context contains the above objects */ Context context = new Context(); /** * @see larm.config.Configurable#configure(larm.config.Configuration) */ public void configure(Configuration conf) { Lifecycle.configure(propertyManager, conf.getSubConfig("properties")); Lifecycle.configure(pipelineManager, conf.getSubConfig("pipes")); Lifecycle.configure(sourceManager, conf.getSubConfig("sources")); } /** * @see larm.framework.Contextualizable#contextualize(larm.framework.Context) */ public void contextualize(Context ctx) { context.set("propertyManager", propertyManager); context.set("sourceManager", sourceManager); context.set("pipelineManager", pipelineManager); 
Lifecycle.contextualize(propertyManager, context); Lifecycle.contextualize(pipelineManager, context); Lifecycle.contextualize(sourceManager, context); } /** * @see larm.framework.Startable#start() */ public void start() { // Lifecycle.start(propertyManager); Lifecycle.start(pipelineManager); Lifecycle.start(sourceManager); } public static void main(String[] args) { System.out.println("LARM"); if(args.length != 1) { System.out.println("Usage: java larm.root.LARM <configfile.xml>"); } Logger log = Logger.getLogger(""); log.setLevel(Level.CONFIG); try { LARM larm = new LARM(); Configuration config = new Configuration(new FileReader(args[0])); log.info("Configuring..."); Lifecycle.configure(larm, config.getSubConfig("/larm")); log.info("Contextualizing..."); Lifecycle.contextualize(larm, null); log.info("Starting..."); Lifecycle.start(larm); } catch(FileNotFoundException e) { System.out.println("Could not find file: " + args[0]); } catch(IOException e) { System.out.println("I/O Error while reading file: " + args[0]); } catch(SAXException e) { System.out.println("Error while parsing " + args[0] + ": " + e.getMessage()); } } } |
From: <ot...@us...> - 2003-07-29 15:11:43
|
Update of /cvsroot/larm/larm/src/java/larm/framework/sources In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/framework/sources Added Files: Source.java Log Message: - Big bad update, reorganization, etc. --- NEW FILE: Source.java --- package larm.framework.sources; /** * Source * * @author * @version $Id: Source.java,v 1.1 2003/07/29 15:11:40 otis Exp $ */ public interface Source { } |
Update of /cvsroot/larm/larm/src/java/larm/framework/pipes In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/framework/pipes Added Files: BlockingPipeline.java MemoryQueue.java Message.java MessageProcessor.java NonblockingPipeline.java Pipeline.java Queue.java Log Message: - Big bad update, reorganization, etc. --- NEW FILE: BlockingPipeline.java --- package larm.framework.pipes; import java.util.ArrayList; import java.util.logging.Level; import java.util.logging.Logger; import larm.framework.Lifecycle; import larm.framework.Startable; import larm.framework.config.ConfigList; import larm.framework.config.Configurable; import larm.framework.config.Configuration; /** * BlockingPipeline * * @author * @version $Id: BlockingPipeline.java,v 1.1 2003/07/29 15:11:40 otis Exp $ */ public class BlockingPipeline implements Pipeline, Configurable, Startable { static Logger log=null; ArrayList processorList = new ArrayList(); MessageProcessor[] processors; int numProcessors; /** * @see larm.pipes.Pipeline#putMessage(larm.pipes.Message) */ public Message process(Message message) { for(int i = 0; i<numProcessors; i++) { message = processors[i].process(message); if(message == null) { break; } } return message; } /** * @see larm.pipes.Pipeline#addMessageProcessor(larm.pipes.MessageProcessor) */ public void addMessageProcessor(MessageProcessor p) { processorList.add(p); processors = (MessageProcessor[])processorList.toArray(); numProcessors = processors.length; } /** * @see larm.config.Configurable#configure(larm.config.Configuration) */ public void configure(Configuration conf) { if(log == null) { log = Logger.getLogger(this.getClass().getName()); } try { ConfigList processors = conf.getSubConfigList("processor"); for(int i = 0; i<processors.length(); i++) { Configuration processor = processors.item(i); String type = processor.getProperty("@type"); Class clazz = Class.forName(type); MessageProcessor newMP = (MessageProcessor)clazz.newInstance(); Lifecycle.configure(newMP, 
processor); addMessageProcessor(newMP); } } catch(IllegalArgumentException e) { log.log(Level.SEVERE,"config", e); } catch(ClassNotFoundException e) { log.log(Level.SEVERE,"class not found", e); } catch(IllegalAccessException e) { log.log(Level.SEVERE,"illegal access", e); } catch(ClassCastException e) { log.log(Level.SEVERE,"not a MessageProcessor", e); } catch(InstantiationException e) { log.log(Level.SEVERE,"could not instantiate MessageProcessor", e); } } /** * @see larm.framework.Startable#start() */ public void start() { for(int i = 0; i<processors.length; i++) { Lifecycle.start(processors[i]); } } } --- NEW FILE: MemoryQueue.java --- package larm.framework.pipes; import java.util.LinkedList; /** * MemoryQueue * * @author * @version $Id: MemoryQueue.java,v 1.1 2003/07/29 15:11:40 otis Exp $ */ public class MemoryQueue implements Queue { LinkedList queue = new LinkedList(); int size=0; int used=0; public MemoryQueue() {} public void init(int maxSize) { queue.clear(); size = maxSize; used = 0; } public int getMaxSize() { return size; } public synchronized int length() { return used; } /** * @see larm.pipes.Queue#dequeue() */ public synchronized Object dequeue() { try { if(used == 0) { wait(); } if(used>0) { Object object = queue.removeLast(); used--; notify(); return object; } } catch(InterruptedException e) { assert false; } return null; } /** * @see larm.pipes.Queue#enqueue(java.lang.Object) */ public synchronized void enqueue(Object object) { try { if(used >= size) { wait(); } if(used < size) { queue.addFirst(object); used++; notify(); } return; } catch(InterruptedException e) { assert false; } } } --- NEW FILE: Message.java --- package larm.framework.pipes; /** * Message * * @author * @version */ public abstract class Message { } --- NEW FILE: MessageProcessor.java --- package larm.framework.pipes; /** * MessageProcessor * * @author * @version $Id: MessageProcessor.java,v 1.1 2003/07/29 15:11:40 otis Exp $ */ public interface MessageProcessor { Message 
process(Message m); } --- NEW FILE: NonblockingPipeline.java --- package larm.framework.pipes; import java.util.logging.Level; import java.util.logging.Logger; import larm.framework.Lifecycle; import larm.framework.Startable; import larm.framework.config.Configurable; import larm.framework.config.Configuration; /** * NonblockingPipeline * * @author * @version */ public class NonblockingPipeline implements Pipeline, Configurable, Startable, Runnable { BlockingPipeline pipe = new BlockingPipeline(); Queue queue; Thread pollingThread; static Logger log = Logger.getLogger(NonblockingPipeline.class.getName()); final static long DEFAULT_QUEUE_SIZE = 1000; /** * @see larm.pipes.Pipeline#addMessageProcessor(larm.pipes.MessageProcessor) */ public void addMessageProcessor(MessageProcessor p) { pipe.addMessageProcessor(p); } /** * will block when the queue is full * @see larm.pipes.Pipeline#putMessage(larm.pipes.Message) */ public synchronized Message process(Message message) { queue.enqueue(message); return message; } /** * @see larm.config.Configurable#configure(larm.config.Configuration) */ public void configure(Configuration conf) { try { if(conf.contains("queue")) { Configuration queueConfig = conf.getSubConfig("queue"); String type = queueConfig.getProperty("@type"); queue = (Queue)Class.forName(type).newInstance(); Lifecycle.configure(queue, queueConfig); } else { queue = new MemoryQueue(); } queue.init((int)conf.getPropertyAsLong("@queueSize", DEFAULT_QUEUE_SIZE)); pipe.configure(conf); } catch(IllegalAccessException e) { log.log(Level.SEVERE, "illegal access", e); } catch(ClassCastException e) { log.log(Level.SEVERE, "not a queue class", e); } catch(InstantiationException e) { log.log(Level.SEVERE, "could not instantiate queue", e); } catch(ClassNotFoundException e) { log.log(Level.SEVERE, "could not find class for queue", e); } } /** * @see larm.framework.Startable#start() */ public void start() { Lifecycle.start(pipe); pollingThread = new Thread(this); 
pollingThread.start(); } /** * @see java.lang.Runnable#run() */ public void run() { while(true) { Message m = (Message)queue.dequeue(); // blocks when empty pipe.process(m); } } } --- NEW FILE: Pipeline.java --- package larm.framework.pipes; /** * Pipeline * * @author * @version $Id: Pipeline.java,v 1.1 2003/07/29 15:11:40 otis Exp $ */ public interface Pipeline extends MessageProcessor { void addMessageProcessor(MessageProcessor p); } --- NEW FILE: Queue.java --- package larm.framework.pipes; /** * Queue * * @author * @version $Id: Queue.java,v 1.1 2003/07/29 15:11:40 otis Exp $ */ public interface Queue { void init(int maxSize); void enqueue(Object object); Object dequeue(); int getMaxSize(); int length(); } |
From: <ot...@us...> - 2003-07-29 15:11:43
|
Update of /cvsroot/larm/larm/src/java/larm/framework/config In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/framework/config Added Files: ConfigList.java Configurable.java Configuration.java Log Message: - Big bad update, reorganization, etc. --- NEW FILE: ConfigList.java --- package larm.framework.config; import org.w3c.dom.Node; import org.w3c.dom.NodeList; /** * ConfigList * encapsulates org.xml.dom.NodeList. returned by @see{Configuration.getSubConfigList()} * * @author * @version $Id: ConfigList.java,v 1.1 2003/07/29 15:11:40 otis Exp $ */ public class ConfigList { NodeList nl; Node root; protected ConfigList(NodeList nodes, Node rootNode) { nl = nodes; root = rootNode; } /** * @return the number of Configuration objects contained in this list; */ public int length() { return nl.getLength(); } /** * returns a configuration item [will be created from a NodeList object * when this method is called] * @param index index of the item. Must not exceed @see{#getLength()} * @return the configuration */ public Configuration item(int index) { return new Configuration(nl.item(index), root); } } --- NEW FILE: Configurable.java --- package larm.framework.config; /** * Configurable * * @author * @version $Id: Configurable.java,v 1.1 2003/07/29 15:11:40 otis Exp $ */ public interface Configurable { void configure(Configuration conf); } --- NEW FILE: Configuration.java --- package larm.framework.config; import java.io.IOException; import java.io.Reader; import java.util.StringTokenizer; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerException; import org.apache.xpath.XPathAPI; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import org.xml.sax.SAXException; /** * Configuration encapsulates an XML file and provides an API similar to the * java.util.Properties class while retaining the benefits of 
a hierarchical * configuration file. Node contents are accessed via * XPath expressions. Configuration doesn't expose any XML APIs. XPath Node lists * are encapsulated via @see{ConfigList}. * * @author * @version $Id: Configuration.java,v 1.1 2003/07/29 15:11:40 otis Exp $ */ public class Configuration { /** * the node represented by this configuration. */ private Node doc; /** * root node for evaluating absolute XPath expressions */ private Node rootNode; protected NodeList getNodes(String xpath) throws IllegalArgumentException { try { return XPathAPI.selectNodeList(doc, xpath); } catch(TransformerException e) { throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")"); } } protected Node getThisNode() { return doc; } protected Node getNode(String xpath) throws IllegalArgumentException { try { return XPathAPI.selectSingleNode(doc, xpath); } catch(TransformerException e) { throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")"); } } public boolean contains(String xpath) { try { return XPathAPI.selectSingleNode(doc, xpath) != null; } catch(TransformerException e) { throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")"); } } /** * resolves strings like * <ol> * <li>"foo ${my.property} bar". my.property is read * from the systems properties and inserted into the string. * <li>"foo $${/my/xpath} bar". /my/path must be a valid xpath argument * that can be translated into a string. 
It is resolved using the configuration's * root element * </ol> * resolveProperty works recursively: If a property contains another property * placeholder, up to 10 levels of recursion will be resolved * @param prop * @param recurse * @return */ protected String resolveProperty(String prop, int recurse) { char[] c = new char[prop.length()]; prop.getChars(0,prop.length(),c,0); int l = c.length; StringBuffer s = new StringBuffer(l * 2); final int CHAR = 0; final int DOLLAR = 1; final int DOLLAR2 = 2; int start = 0; final int PROPERTY = 3; final int XPATH = 4; StringBuffer v = new StringBuffer(); int mode = CHAR; for(int i = 0; i<c.length; i++) { char ac = c[i]; switch(mode) { case CHAR: if(ac == '$') { mode = DOLLAR; start = i; } else { s.append(ac); } break; case DOLLAR: if(ac == '{') { mode = PROPERTY; } else if(ac == '$') { mode = DOLLAR2; } else { mode = CHAR; s.append('$'); s.append(ac); } break; case DOLLAR2: if(ac == '{') { mode = XPATH; } else { mode = CHAR; s.append('$'); s.append('$'); } break; case PROPERTY: if(ac == '}') { // got Java property. resolve it. String varName = new String(c,start + 2,i-start-2); String value = getJavaPropertyAsStringDontResolve(varName); if(recurse == 0) { throw new IllegalStateException("recursion limit reached while resolving " + varName + " -> " + value); } String p = resolveProperty(value,recurse-1); s.append(p); mode = CHAR; } case XPATH: if(ac == '}') { // got XPath expression. resolve it. 
String varName = new String(c,start + 2,i-start-2); String value = getPropertyAsStringDontResolve(rootNode,varName); if(recurse == 0) { throw new IllegalStateException("recursion limit reached while resolving " + varName + " -> " + value); } String p = resolveProperty(value,recurse-1); s.append(p); mode = CHAR; } } } if(mode != CHAR) { s.append(c,start,l-start); } return s.toString(); } protected String getJavaPropertyAsStringDontResolve(String varName) { return System.getProperty(varName, ""); } public String getPropertyAsStringDontResolve(String xpath) throws IllegalArgumentException { return getPropertyAsStringDontResolve(doc, xpath); } protected String getPropertyAsStringDontResolve(Node root, String xpath) throws IllegalArgumentException { try { return XPathAPI.eval(root, "string(" + xpath + ")").str(); } catch(TransformerException e) { throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")"); } } /** * returns a property as a string or null if the property was not set * @param xpath an XPath expression like '/foo[@bar="value"]' * @return a string, empty if expression didn't exist * @throws IllegalArgumentException if there was a problem with the XPath * @throws IllegalStateException if properties couldn't be resolved (e.g. recursion level reached) */ public String getProperty(String xpath) throws IllegalArgumentException, IllegalStateException { return resolveProperty(getPropertyAsStringDontResolve(doc,xpath), 10); } /** * returns a property as a string or the default value if the property wasn't set * @param xpath an XPath expression like '/foo[@bar="value"]' * @return a string * @throws IllegalArgumentException if there was a problem with the XPath */ public String getProperty(String xpath, String def) throws IllegalArgumentException { String p = getProperty(xpath); return !"".equals(p) ? 
p : def; } /** * returns a property as a long or 0 if it was not set * @param xpath an XPath expression like './foo[@bar="value"]' * @return the value or 0 * @throws IllegalArgumentException if there was a problem with the XPath */ public long getPropertyAsLong(String xpath) throws IllegalArgumentException, NumberFormatException { return getPropertyAsLong(xpath, 0); } /** * returns a property as a long or the default value if it was not set * @param xpath an XPath expression like './foo[@bar="value"]' * @return the value * @throws IllegalArgumentException if there was a problem with the XPath */ public long getPropertyAsLong(String xpath, long def) throws IllegalArgumentException, NumberFormatException { try { return Long.parseLong(XPathAPI.eval(doc, "string(" + xpath + ")").str()); } catch(TransformerException e) { throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")"); } catch(NullPointerException npe) { return def; } } /** * returns a property as a double or 0 if it was not set * @param xpath an XPath expression like './foo[@bar="value"]' * @return the value * @throws IllegalArgumentException if there was a problem with the XPath */ public double getPropertyAsDouble(String xpath) throws IllegalArgumentException { try { return Double.parseDouble(XPathAPI.eval(doc, "string(" + xpath + ")").str()); } catch(TransformerException e) { throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")"); } } /** * returns a property as a boolean or the default value if it was not set * @param xpath an XPath expression like './foo[@bar="value"]' * @param deflt the default value * @return the value * @throws IllegalArgumentException if there was a problem with the XPath */ public boolean getPropertyAsBoolean(String xpath, boolean deflt) throws IllegalArgumentException { try { return Boolean.valueOf(XPathAPI.eval(doc, "string(" + xpath + ")").str()).booleanValue(); } catch(TransformerException e) { throw 
new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")"); } catch(NullPointerException e) { return deflt; } } /** * returns a property as a long value indicating a number of bytes or the default * value if the property was not set. * The input string must conform to (NUMBER (BYTES|KB|MB|GB))+ * with BYTES= EMPTY|b|byte|bytes, * KB = k|kb|kbyte|kbytes|kilobyte|kilobytes * MB = m|mb|mbyte|mbytes|megabyte|megabytes * GB = g|gb|gbyte|gbytes|gigabyte|gigabytes * and EMPTY = the empty string<p> * example * <ul> * <li>"2000" (2000 bytes) * <li>"10 kb" (or "10 k", "1 kbyte", "10 kbytes", "1 kilobyte", "10 kilobytes") * <li>"3.4 mb" (or "m", "mbyte", "mbytes", "megabyte", "megabytes") * <li>"0.3 gb" (or "g", "gbyte", "gbytes", "gigabyte", "gigabytes") * </ul> * which are all resolved to their byte values (we say k = kb = kilo = 1024 * although this is not perfectly correct since kilo = 1000; the same applies * to mb or gb)<p> * Tokens are separated by whitespace * @param xpath an XPath expression like './foo[@bar="value"]' * @param deflt the default value * @return the value * @throws IllegalArgumentException if there was a problem with the XPath */ public long getPropertyAsNrOfBytes(String xpath, long deflt) throws IllegalArgumentException { try { return parseNrOfBytes(XPathAPI.eval(doc, "string(" + xpath + ")").str()); } catch(TransformerException e) { throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")"); } catch(NullPointerException e) { return deflt; } } private long parseNrOfSeconds(String s) { StringTokenizer t = new StringTokenizer(s); long ret = 0; while(t.hasMoreTokens()) { double t1 = Double.parseDouble(t.nextToken()); int mult = 1; if(t.hasMoreTokens()) { String type = t.nextToken().toLowerCase(); if("d".equals(type) || "day".equals(type) || "days".equals(type)) { mult = 3600 * 24; } if("h".equals(type) || "hours".equals(type)) { mult = 3600; } else if("m".equals(type) || 
"min".equals(type) || "mins".equals(type) || "minutes".equals(type)) { mult = 60; } else if(!("s".equals(type) || "sec".equals(type) || "seconds".equals(type))) { throw new IllegalArgumentException("s|sec|seconds|m|min|mins|minutes|h|hours expected. (Argument was: '" + type + "')"); } } ret += (long)(t1 * mult); } return ret; } protected long parseNrOfBytes(String s) { StringTokenizer t = new StringTokenizer(s); long ret = 0; while(t.hasMoreTokens()) { double t1 = Double.parseDouble(t.nextToken()); long mult = 1; if(t.hasMoreTokens()) { String type = t.nextToken().toLowerCase(); if("k".equals(type) || "kb".equals(type) || "kbyte".equals(type) || "kbytes".equals(type)) { mult = 1024; } else if("mb".equals(type) || "mbyte".equals(type) || "mbytes".equals(type) || "megabyte".equals(type) || "megabytes".equals(type)) { mult = 1024 * 1024; } else if("gb".equals(type) || "gbyte".equals(type) || "gbytes".equals(type) || "gigabyte".equals(type) || "gigabytes".equals(type)) { mult = 1024 * 1024 * 1024; } else { throw new IllegalArgumentException("k|kb|kbyte|kbytes|mb|mbyte|mbytes|megabyte|megabytes|gb|gbyte|gbytes|gigabytes expected. (Argument was: '" + type + "')"); } } ret += (long)(t1 * mult); } return ret; } /** * returns a property as a long value indicating a number of seconds or the default * value if the property was not set. * The input string must conform to (NUMBER (SECS|MINS|HOURS|DAYS))+ * with SECS= EMPTY|s|sec|secs|second|seconds, * MINS = m|min|mins|minutes and * HOURS = h|hour|hours * DAYS = d|day|days * where EMPTY is the empty string. 
Tokens are separated by whitespace * examples: * <ul> * <li>"30" (30 seconds) * <li>"2 mins 10 secs" * <li>"3.5 hours" * <li>"1 day" * </ul> * @param xpath * @return the value in seconds */ protected long getPropertyAsNrOfSeconds(String xpath, long dfault) throws IllegalArgumentException { try { return parseNrOfSeconds(XPathAPI.eval(doc, "string(" + xpath + ")").str()); } catch(TransformerException e) { throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")"); } catch(NullPointerException n) { return dfault; } } /** * returns a configuration object representing the sub-graph of the given XML * tree * @param xpath an expression denoting a node that contains a sub-tree. * @return the configuration or null if the node doesn't exist * @throws IllegalArgumentException if x */ public Configuration getSubConfig(String xpath) throws IllegalArgumentException { Node n = getNode(xpath); if(n == null) { //return null; throw new IllegalArgumentException("expected tag '" + xpath + "' in configuration file"); } return new Configuration(n, rootNode); } /** * returns a list of Configuration objects representing sub-graphs of the * given XML tree, specified by an xPath expression * @param xpath an expression denoting a list of nodes that contain a sub-tree. * @return the configuration or null if the node doesn't exist * @throws IllegalArgumentException if x */ public ConfigList getSubConfigList(String xpath) throws IllegalArgumentException { NodeList nl = getNodes(xpath); if(nl == null) { throw new IllegalArgumentException("node '" + xpath + "' does not exist"); } return new ConfigList(nl, rootNode); } protected Configuration(Node node, Node root) { this.doc = node; this.rootNode = root; } /** * Constructor. Node will be the root node * @param node root node */ public Configuration(Node node) { doc = rootNode = node; } /** * Constructor. config must contain a valid XML file. * @param config the XML file this Configuration represents. 
* @throws IOException * @throws SAXException */ public Configuration(Reader config) throws IOException, SAXException { try { // inspired by http://cafeconleche.org/books/xmljava/chapters/ch16s06.html DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); DocumentBuilder builder = null; factory.setNamespaceAware(true); builder = factory.newDocumentBuilder(); InputSource data = new InputSource(config); doc = rootNode = builder.parse(data); } catch(ParserConfigurationException e) { throw new RuntimeException("Couldn't initialize parser", e); } } } |
From: <ot...@us...> - 2003-07-29 15:11:43
|
Update of /cvsroot/larm/larm/src/java/larm/framework In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/framework Added Files: Context.java Contextualizable.java Lifecycle.java Startable.java Stoppable.java Log Message: - Big bad update, reorganization, etc. --- NEW FILE: Context.java ---
package larm.framework;

import java.util.HashMap;
import java.util.Iterator;

/**
 * Context - a simple string-keyed container of shared objects, handed to
 * components that implement {@link Contextualizable}.
 *
 * @author
 * @version $Id: Context.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public class Context {
    // backing store; keys are Strings, values arbitrary objects
    private HashMap context = new HashMap();
    /** Stores (or replaces) the value under the given key. */
    public void set(String key, Object value) { context.put(key, value); }
    /** Returns the value stored under the key, or null if absent. */
    public Object get(String key) { return context.get(key); }
    /** Returns an iterator over all keys currently stored. */
    public Iterator keys() { return context.keySet().iterator(); }
}
--- NEW FILE: Contextualizable.java ---
package larm.framework;

/**
 * Contextualizable - implemented by components that want the shared
 * {@link Context} injected via {@link Lifecycle#contextualize}.
 *
 * @author
 * @version $Id: Contextualizable.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface Contextualizable {
    public void contextualize(Context ctx);
}
--- NEW FILE: Lifecycle.java ---
package larm.framework;

import larm.framework.config.Configurable;
import larm.framework.config.Configuration;

/**
 * LifecycleHelper - static helpers that apply a lifecycle step to an object
 * only when it implements the corresponding interface; otherwise each call
 * is a no-op.
 *
 * @author
 * @version $Id: Lifecycle.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public class Lifecycle {
    /** Configures o with c if o is Configurable. */
    public static void configure(Object o, Configuration c) { if(o instanceof Configurable) { ((Configurable)o).configure(c); } }
    /** Starts o if it is Startable. */
    public static void start(Object o) { if(o instanceof Startable) { ((Startable)o).start(); } }
    /** Stops o if it is Stoppable. */
    public static void stop(Object o) { if(o instanceof Stoppable) { ((Stoppable)o).stop(); } }
    /** Passes the context c to o if it is Contextualizable. */
    public static void contextualize(Object o, Context c) { if(o instanceof Contextualizable) { ((Contextualizable)o).contextualize(c); } }
}
--- NEW FILE: Startable.java ---
package larm.framework;

/**
 * Startable - implemented by components that have an explicit start step.
 *
 * @author
 * @version $Id: Startable.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface Startable {
    void start();
}
--- NEW FILE: Stoppable.java ---
package larm.framework;

/**
 * Stoppable - implemented by components that have an explicit stop step.
 *
 * @author
 * @version $Id: Stoppable.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface Stoppable {
    void stop();
}
|
From: <ot...@us...> - 2003-07-29 15:11:42
|
Update of /cvsroot/larm/larm/src/java/larm/config In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/config Modified Files: PropertyManager.java Removed Files: ConfigList.java Configuration.java Log Message: - Big bad update, reorganization, etc. Index: PropertyManager.java =================================================================== RCS file: /cvsroot/larm/larm/src/java/larm/config/PropertyManager.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** PropertyManager.java 24 Jul 2003 11:48:08 -0000 1.1 --- PropertyManager.java 29 Jul 2003 15:11:40 -0000 1.2 *************** *** 1,6 **** - /* - * - * $Id$ - */ package larm.config; --- 1,2 ---- *************** *** 18,31 **** * * @author */ public class PropertyManager implements Configurable { ! static Logger log = Logger.getLogger(PropertyManager.class.getName());; ! public PropertyManager() { } ! /** * @see larm.config.Configurable#configure(larm.config.Configuration) --- 14,28 ---- * * @author + * @version $Id$ */ public class PropertyManager implements Configurable { ! static Logger log = Logger.getLogger(PropertyManager.class.getName());; ! public PropertyManager() { } ! /** * @see larm.config.Configurable#configure(larm.config.Configuration) *************** *** 43,51 **** System.setProperty(name, value); } ! props = conf.getSubConfigList("property[@file]"); for(int i = 0; i < props.length(); i++) { ! String fileName=null; try { --- 40,48 ---- System.setProperty(name, value); } ! props = conf.getSubConfigList("property[@file]"); for(int i = 0; i < props.length(); i++) { ! String fileName=null; try { --- ConfigList.java DELETED --- --- Configuration.java DELETED --- |
From: <ot...@us...> - 2003-07-29 15:11:42
|
Update of /cvsroot/larm/larm/src/config In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/config Added Files: empty.xml Log Message: - Big bad update, reorganization, etc. --- NEW FILE: empty.xml --- <?xml version="1.0"?> <larm> <properties> <property name="pipe1name" value="pi"/> <property file="my.properties"/> </properties> <sources> </sources> <pipes> <nonblockingPipeline name="${pipe1name}"> </nonblockingPipeline> </pipes> </larm> |
From: <ot...@us...> - 2003-07-29 15:10:43
|
Update of /cvsroot/larm/larm/src/java/larm/pipes In directory sc8-pr-cvs1:/tmp/cvs-serv23663/src/java/larm/pipes Added Files: PipelineManager.java Log Message: - Initial checkin. --- NEW FILE: PipelineManager.java ---
/*
 * Created on 30.06.2003 by Administrator
 *
 * $Id: PipelineManager.java,v 1.1 2003/07/29 15:10:40 otis Exp $
 */
package larm.pipes;

import java.util.HashMap;
import java.util.Iterator;
import java.util.logging.Logger;

import larm.framework.Lifecycle;
import larm.framework.Startable;
import larm.framework.config.ConfigList;
import larm.framework.config.Configurable;
import larm.framework.config.Configuration;
import larm.framework.pipes.NonblockingPipeline;
import larm.framework.pipes.Pipeline;

/**
 * PipelineManager - builds the pipelines declared by the configuration's
 * "nonblockingPipeline" / "blockingPipeline" elements, registers them by
 * name, and starts them all on {@link #start()}.
 *
 * @author Administrator
 * 30.06.2003
 */
public class PipelineManager implements Configurable, Startable {

    static Logger log = Logger.getLogger(PipelineManager.class.getName());

    // pipeline name (the element's @name attribute) -> Pipeline instance
    HashMap pipes = new HashMap();

    /** Returns the pipeline registered under the given name, or null. */
    public Pipeline getPipeline(String name) {
        return (Pipeline)pipes.get(name);
    }

    /**
     * Creates and configures one pipeline per matching configuration element.
     * @see larm.config.Configurable#configure(larm.config.Configuration)
     */
    public void configure(Configuration conf) {
        ConfigList list = conf.getSubConfigList("nonblockingPipeline | blockingPipeline");
        for(int i = 0; i < list.length(); i++) {
            Configuration c = list.item(i);
            String type = c.getProperty("name()");
            String name = c.getProperty("@name");
            log.info("found pipe of type " + type + " with name " + name);
            // BUGFIX: the original compared against "nonBlockingPipeline"
            // (capital 'B'), which can never equal the element name selected
            // by the XPath union above ("nonblockingPipeline", as also used
            // in src/config/empty.xml), so every pipeline hit the assert.
            if("nonblockingPipeline".equals(type)) {
                NonblockingPipeline p = new NonblockingPipeline();
                Lifecycle.configure(p, c);
                pipes.put(name, p);
            } else if("blockingPipeline".equals(type)) {
                // NOTE(review): also instantiates NonblockingPipeline --
                // presumably a placeholder until a blocking implementation
                // exists; confirm before relying on blocking semantics.
                NonblockingPipeline p = new NonblockingPipeline();
                Lifecycle.configure(p, c);
                pipes.put(name, p);
            } else {
                assert false : "unexpected pipeline type: " + type;
            }
        }
        if(list.length() == 0) {
            log.info("no pipelines to register");
        }
    }

    /**
     * Starts all registered pipelines (Startable ones only, via Lifecycle).
     * @see larm.framework.Startable#start()
     */
    public void start() {
        for(Iterator it = pipes.values().iterator(); it.hasNext();) {
            Lifecycle.start(it.next());
        }
    }
}
|
From: <ot...@us...> - 2003-07-24 12:26:51
|
Update of /cvsroot/larm/larm/src/java/larm/framework/sources In directory sc8-pr-cvs1:/tmp/cvs-serv32549/src/java/larm/framework/sources Log Message: Directory /cvsroot/larm/larm/src/java/larm/framework/sources added to the repository |
From: <ot...@us...> - 2003-07-24 12:26:51
|
Update of /cvsroot/larm/larm/src/java/larm/framework/pipes In directory sc8-pr-cvs1:/tmp/cvs-serv32549/src/java/larm/framework/pipes Log Message: Directory /cvsroot/larm/larm/src/java/larm/framework/pipes added to the repository |
From: <ot...@us...> - 2003-07-24 12:26:51
|
Update of /cvsroot/larm/larm/src/java/larm/framework/config In directory sc8-pr-cvs1:/tmp/cvs-serv32549/src/java/larm/framework/config Log Message: Directory /cvsroot/larm/larm/src/java/larm/framework/config added to the repository |
From: <ot...@us...> - 2003-07-24 12:22:55
|
Update of /cvsroot/larm/larm/src/java/larm/sources In directory sc8-pr-cvs1:/tmp/cvs-serv31947/src/java/larm/sources Log Message: Directory /cvsroot/larm/larm/src/java/larm/sources added to the repository |
From: <ot...@us...> - 2003-07-24 12:22:55
|
Update of /cvsroot/larm/larm/src/java/larm/framework In directory sc8-pr-cvs1:/tmp/cvs-serv31947/src/java/larm/framework Log Message: Directory /cvsroot/larm/larm/src/java/larm/framework added to the repository |
From: <ot...@us...> - 2003-07-24 12:22:55
|
Update of /cvsroot/larm/larm/src/test/larm/framework In directory sc8-pr-cvs1:/tmp/cvs-serv31947/src/test/larm/framework Log Message: Directory /cvsroot/larm/larm/src/test/larm/framework added to the repository |
From: <ot...@us...> - 2003-07-24 12:22:55
|
Update of /cvsroot/larm/larm/src/config In directory sc8-pr-cvs1:/tmp/cvs-serv31947/src/config Log Message: Directory /cvsroot/larm/larm/src/config added to the repository |
From: <ot...@us...> - 2003-07-24 12:22:55
|
Update of /cvsroot/larm/larm/src/java/larm/root In directory sc8-pr-cvs1:/tmp/cvs-serv31947/src/java/larm/root Log Message: Directory /cvsroot/larm/larm/src/java/larm/root added to the repository |
From: <ot...@us...> - 2003-07-24 12:18:21
|
Update of /cvsroot/larm/larm/docs In directory sc8-pr-cvs1:/tmp/cvs-serv31326/docs Modified Files: contents.txt crawler.txt framework.txt indexer.txt packages.txt processors.txt Log Message: - Updated. Index: contents.txt =================================================================== RCS file: /cvsroot/larm/larm/docs/contents.txt,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** contents.txt 24 Jun 2003 17:50:44 -0000 1.5 --- contents.txt 24 Jul 2003 12:18:17 -0000 1.6 *************** *** 1,85 **** ! ! Specification Document for LARM. ! ! $Id$ ! ! Log: ! ---------------+-----------+--------------------------------------------------- ! cmarschn 10-Jun-03 Created. Will write all but the parts in () ! cmarschn 11-Jun-03 Added sections for framework, extended crawler, ! common development patterns ! cmarschn 15-Jun-03 Worked on the crawler part, wrote framwork ! cmarschn 20-Jun-03 ! cmarschn 23-Jun-03 ! ---------------+-----------+--------------------------------------------------- ! ! ! Contents ! ! ------------------------------------------------------------------------------- ! ! [Part I: Framework] framework.txt ! ! I. Messaging Framework framework.txt ! 1. Pipelines ! 2. Sources and Drains ! 3. Notifications or Polling ! 4. Batch file operation ! 5. Batch file indexing ! ! II. Configuration framework.txt ! 1. XML Configuration ! 2. Configuration files ! 3. Startup/Shutdown ! ! [Part II: Gatherers] ! ! III. Crawler crawler.txt ! 1. Crawl Requests ! 3. DNS Handling ! 4. Robot Exclusion ! 5. Link Analysis ! 6. Distribution ! 7. Persistence ! 8. Configuration ! 9. Log File(s) ! 10. Recrawls ! ! (IV. File System Gatherer) ! 1. Configuration ! 2. Reindexing ! ! (V. Database Gatherer) ! ! (VI. Other Sources (JMS, Mail, Web Services...)) ! ! [Part III: Record Processors] processors.txt ! ! VII. Format conversion (PDF, Word, HTML etc.) ! VIII. Link Extraction ! IX. Distribution to different index fields ! X. 
Applying link analysis to document weights ! ! [Part IV: Indexer] indexer.txt ! ! XI. The Indexer ! 1. Message formats ! 2. Persistence ! 3. Configuration ! 4. Log File(s) ! ! ([Part V: Search]) ! ! (XII. Search interface) ! (XIII. Data Display) ! ! [Part VI: Common Development Patterns] ! XIV. Logging ! XV. Test Cases ! XVI. Package layout ! ! [Part VII: Appendix] ! ! XVII. Used Packages packages.txt ! XVIII. Glossary --- 1,85 ---- ! ! Specification Document for LARM. ! ! $Id$ ! ! Log: ! ---------------+-----------+--------------------------------------------------- ! cmarschn 10-Jun-03 Created. Will write all but the parts in () ! cmarschn 11-Jun-03 Added sections for framework, extended crawler, ! common development patterns ! cmarschn 15-Jun-03 Worked on the crawler part, wrote framwork ! cmarschn 20-Jun-03 ! cmarschn 23-Jun-03 ! ---------------+-----------+--------------------------------------------------- ! ! ! Contents ! ! ------------------------------------------------------------------------------- ! ! [Part I: Framework] framework.txt ! ! I. Messaging Framework framework.txt ! 1. Pipelines ! 2. Sources and Drains ! 3. Notifications or Polling ! 4. Batch file operation ! 5. Batch file indexing ! ! II. Configuration framework.txt ! 1. XML Configuration ! 2. Configuration files ! 3. Startup/Shutdown ! ! [Part II: Gatherers] ! ! III. Crawler crawler.txt ! 1. Crawl Requests ! 3. DNS Handling ! 4. Robot Exclusion ! 5. Link Analysis ! 6. Distribution ! 7. Persistence ! 8. Configuration ! 9. Log File(s) ! 10. Recrawls ! ! (IV. File System Gatherer) ! 1. Configuration ! 2. Reindexing ! ! (V. Database Gatherer) ! ! (VI. Other Sources (JMS, Mail, Web Services...)) ! ! [Part III: Record Processors] processors.txt ! ! VII. Format conversion (PDF, Word, HTML etc.) ! VIII. Link Extraction ! IX. Distribution to different index fields ! X. Applying link analysis to document weights ! ! [Part IV: Indexer] indexer.txt ! ! XI. The Indexer ! 1. Message formats ! 2. 
Persistence ! 3. Configuration ! 4. Log File(s) ! ! ([Part V: Search]) ! ! (XII. Search interface) ! (XIII. Data Display) ! ! [Part VI: Common Development Patterns] ! XIV. Logging ! XV. Test Cases ! XVI. Package layout ! ! [Part VII: Appendix] ! ! XVII. Used Packages packages.txt ! XVIII. Glossary Index: crawler.txt =================================================================== RCS file: /cvsroot/larm/larm/docs/crawler.txt,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** crawler.txt 24 Jun 2003 17:51:48 -0000 1.5 --- crawler.txt 24 Jul 2003 12:18:17 -0000 1.6 *************** *** 1,272 **** ! ! $Id$ ! ! ------------------------------------------------------------------------------- ! III. The Crawler ! ! The crawler contains a special type of pipeline whose configuration is very ! limited. The reason is that the crawler parts use some shared data structures ! and contain some internal dependencies (e.g. the order in which different ! processing steps are done). Nevertheless we decided to keep up the pipeline ! paradigm to separate concerns into different classes and to avoid a large ! "Crawler" class that contains such different operations like Document fetching ! and robot exclusion. ! ! 1. The Fetcher: Crawl Requests and Crawler Output ! ! The Fetcher sits at the core of the crawler. It takes CrawlRequests and outputs ! raw CrawlRecords. ! ! A crawl request consists of the following fields: ! ! method: one of NEW or CHECK_FOR_RECRAWL [or CHECK_FOR_SERVER_RUNNING]. NEW ! loads a document as given by a URL. If network errors occur the fetcher can ! be parameterized to wait and retry a couple of times, set the host to ! BAD_STATE if unsuccessful, and output a SERVER_PROBLEM message. CHECK checks ! the document for changes (in the MD5). The crawler may behave differently when ! CHECK_FOR_RECRAWL or CHECK_FOR_SERVER_RUNNING was chosen. In the latter case ! the crawler may decide to check the host only once. In the case of CHECK an ! 
MD5 checksum has to be provided. [When using CHECK we don't look for changed ! Dates since they have proven to be unreliable.] ! ! url: URL: The URL of the document to crawl ! ! MD5Hash: MD5Hash: An MD5 Hash of the document, if method is CHECK ! ! interface CrawlRequest { ! ! // enum RequestMethod ! final static byte NEW = 1; ! final static byte CHECK_FOR_RECRAWL = 2; ! ! byte requestMethod; // type RequestMethod ! URL url; ! [long lastModified;] // if CHECK_FOR_RECRAWL, can be sent as ! If-Modified-Since] ! MD5Hash MD5Hash; // set to null if requestMethod == NEW ! } ! ! A CrawRecord is the output of a crawler that contains the raw document as loaded ! by the crawler threads. It contains the following fields: ! ! url: URL: The original URL of the crawl ! ! finalURL: URL: The final URL if HTTP responds with 30x result codes. The ! crawler can be configured a maximum number of detours to take if such a result ! code occurs. ! ! requestMethod: byte. request method as in CrawlRequest ! ! fingerprint: MD5Hash. hash value of the document contents. ! ! HTTPstatus: short. The HTTP status code as returned by the last try. e.g. 200 ! ! crawlerStatus: short. error code if not reflected through the HTTPStatus code ! ! MIMEType: String. The MIME type of the document loaded. e.g. "text/html" ! ! encoding: String. The document encoding if provided ! ! lastModified: Date. time when the doc was last changed. ! ! headers: the HTTP headers returned ! ! encoding: String. Content-Encoding as specified in a HTTP header ! ! contents: Object. Either a byte[] or a char[] depending on the MIME type and ! encoding. Since HTML or XML files themselves may contain an "encoding" ! attribute on their own the fetcher doesn't make any assumptions on the real ! content tyspe. ! ! interface CrawlRecord { ! ! // crawler status ! final static byte CS_OK 0 // if HTTPStatus == 200 ! final static byte CS_ERROR_IN_HTTP 1 // if HTTPStatus != 200 ! final static byte CS_TOO_MANY_REDIRECTS 2 // e.g. 
301/302 redirect loop ! final static byte CS_UNKNOWN_HOST 3 // host name doesn't exist ! final static byte CS_HOST_NOT_REACHABLE 4 // server not running ! final static byte CS_READ_TIMEOUT 5 // server or network too slow ! final static byte CS_NO_ROUTE_TO_HOST 6 // network problem ! // (NoRouteToHostException) ! final static byte CS_PORT_CLOSED 7 // no server running on this ! // port (ConnectException) ! final static byte CS_FILE_TOO_LARGE 8 // file exceeded maximum size ! // and was truncated ! final static byte CS_IO_EXCEPTION 100 // unknown IO exception ! ! URL url; // -> IndexRecord.URI ! URL finalURL; // -> IndexRecord.secondaryURIs[0] ! byte requestMethod; // see above ! MD5Hash fingerprint; // -> IndexRecord.fingerprint ! short HTTPStatus; // HTTP status code ! byte crawlerStatus; // that are not reflected in HTTPStatus ! String MIMEType; // IndexRecord.MIMEType ! String[][] headers; // HTTP headers ! String encoding; // ISO, UTF, Base64, Gzip, etc. ! long lastModified; // same as in CrawlRequest if not modified, else timestamp ! byte[] contents; ! } ! ! The Fetcher is controlled by a FetcherManager that distributes CrawlRequests ! among different threads. The threads get batches of crawl requests if available ! to minimize synchronization. They can also be configured such that they collect ! a couple of documents before they put them to the output queue. ! ! We will use the hashCode of the hostname modulo the number of threads to assign ! fetches to the different threads. Each thread will have a priority queue and a ! small host name cache for the incoming requests. (for the start we will use ! Javas built-in host name cache). This way a thread can do its work without the ! need to communicate with or block other threads. ! ! The priority queue is used to keep hosts in a wait state while new hosts are ! crawled. Each time a page is crawled from a host it will come into a wait state ! for a configurable threshold until the next request is issued. ! ! 
[If implemented using non blocking-IO it may also be that a thread keeps ! downloading more than one host at once. This is presumably faster since it saves ! a lot of threads and with it the task switching overhead. The old IO also needs ! the data to be copied a couple of times. The best implementation still has to be ! figured out. Presumably a set of Fetcher threads that are responsible for a ! number of hosts and use non-blocking IO will show the best performance.] ! ! The Fetcher tries to be completely bound to network I/O and will not perform ! extractions if the content is compressed (that is, it sends an "accept-encoding: ! gzip" message if configured but will not perform a decompression step). ! ! The Fetcher also has to keep track of the hosts. Since it cannot hold infos ! about all hosts in RAM, a (LRU) caching mechanism has to be used that contains ! the following information for each host (HostInfo): ! ! hostName: String: a DNS name as an identifier ! IP-Address: InetAddrss: IP-Adress of this host ! ipExpires: long: Expiry time for the IP cache ! robots: : a data structure that is used by a RobotsTxtFilter ! robotsExpires: long: a time that defines when robots.txt has to be reloaded ! ! ! interface HostInfo { ! String hostName; ! InetAddress ipAddress; ! long ipExpires ! ? robots; ! long robotsExpires; ! } ! ! Since HostInfos are looked up using their hostNames they should be stored in a ! simple hash with the hostName as its key. ! ! From the caching point of view it is advisable that incoming CrawlRequests are ! not evenly distributed over the host name space. From a network efficiency point ! of view exactly this should be the case. This conflict may be resolved in the ! following way: Say a batch of CrawlRequests contains a maximum of 5000 hosts and ! a maximum of 100,000 requests. If one of these numbers is exceeded the batch is ! cut into several pieces that each obeys these rules. Then the HostInfo cache can ! 
be as large as the number of hosts in the batch and may only need to access ! secondary storage as a new batch is started. It can be implemented as a simple ! LRU cache. ! ! 3. DNS Handling ! ! Since DNS resolution takes a lot of time it is advisable to store ipAddresses of ! host names in the HostInfo structure. This calls for a URL implementation that ! doesn't do a resolution on its own, and an HTTP 1.1 implementation that can use ! the ipAddress as given in the HostInfo structure. ! ! For now we use Jakarta HTTPClient which doesn't do address resolution. ! Currently DNS resolution works the following way: ! a) a request to open a connection is sent to HTTPClient ! b) HTTPClient creates a new java.net.Socket with the host name as its argument ! c) if the host name is an IP address, Socket opens it directly. ! If it is a DNS name, ! d) Socket calls getCachedAddress() ! e) getCachedAddress will first perform a linear scan through its host name list ! to see whether resolved names have expired ! f) the host name is looked up in the cache. If it is not found, it is resolved ! through an internal Naming Service class and saved to the cache. ! Since e) takes linear time even when the name is in the cache it unnecessarily ! slows things down if we have 1000s of host names in the cache. In this case ! we would have to resolve the IP address for ourselves, or HTTPClient would have ! to do it, since it later needs the host name for sending an HTTP 1.1 request. ! ! ! 4. Robot Exclusion ! ! Since the incoming CrawlRequests may have been generated a long time ago the ! fetcher has to take care about changed robot exclusion policies while it is ! fetching the documents. For this sake a filter has to be applied shortly before ! a request is made to the server, and robots.txt files have to be reloaded before ! the first request to a server is made and after a specific time has elapsed. ! ! 6. Persistence ! ! 
CrawlRequests are usually performed in batches that are read from secondary ! storage. These files again may contain a large number of requests that are read ! in steps of <n> requests as specified in the config. Fast crawls demand for a ! large number of hosts in these files and for avoidance of the same hosts in ! subsequent URLs [see Shapenyuk/Suel]. ! ! CrawlRecords again are also written in batches of <n> records and are also ! distributed among several files. They may also be distributed among different ! directories in order to use NFS as a cheap distribution mechanism for the ! indexing step. ! ! ! 7. Distribution ! ! A Fetcher/FetcherManager combination can be distributed among different hosts if ! extracted links are divided such that a node is made responsible for a distinct ! set of hosts. The communication between different crawler nodes takes place in ! batches. To avoid a central component that distributes these Collections of ! CrawlRequests, each node has to know about the other nodes and which hosts this ! node addresses. ! It seems viable to use the hash value of the hostname of the URL to be crawled ! to split this up. But this is supposed to be done in a processing component like ! directly after link extraction. In the Shkabenyuk/Suel this is named the ! "crawling application". Thus it is not part of the crawler itself. ! ! For ease of use the crawler should adapt if a new crawler node is added. Say ! there are three nodes, and all crawl requests are divided into three queues that ! are distributed to these nodes. If a new node is started, the crawling ! application should get a message and start dividing the URLs into four pieces. ! On the other hand, if more than one crawling application is needed, the fetchers ! need to know where to send the downloaded files. This again could be divided by ! the URL. A similar mechanism should apply. ! ! 8. Configuration ! ! - FetcherManager ! - method: NIO or old IO ! - number of threads ! 
- NIO: number of concurrent requests (=concurrent hosts) per thread ! - number of seconds between subsequent requests to a host ! - number of redirects to follow after page is quit with TOO_MANY_REDIRECTS ! - maximum file size ! - number of seconds to wait for a server to send the file completely ! - HTTP User Agent String ! - size of host name cache ! - size of temp cache for loading docs ! - use "Accept-Encoding: gzip [Compress, Deflate?]" ! ! 9. From CrawlRecords to IndexRecords ! ! Crawl- and IndexRecords seem to be pretty similar, but in fact they differ in a ! variety of features. ! An IndexRecord is crawl-agnostic. It is used for different document sources and ! thus doesn't know about HTTP status codes and the like. ! ! [There will be a converter between Crawl- and IndexRecords at some time in the ! pipeline. This will be configurable such that CrawlRecord entries may become ! generic fields within an IndexRecord] ! ! 9. Log Files ! ! 10. Incremental Crawling ! ! 11. Startup/Shutdown ! ! 12. Packages and Dependencies ! ! ! ! --- 1,272 ---- ! ! $Id$ ! ! ------------------------------------------------------------------------------- ! III. The Crawler ! ! The crawler contains a special type of pipeline whose configuration is very ! limited. The reason is that the crawler parts use some shared data structures ! and contain some internal dependencies (e.g. the order in which different ! processing steps are done). Nevertheless we decided to keep up the pipeline ! paradigm to separate concerns into different classes and to avoid a large ! "Crawler" class that contains such different operations like Document fetching ! and robot exclusion. ! ! 1. The Fetcher: Crawl Requests and Crawler Output ! ! The Fetcher sits at the core of the crawler. It takes CrawlRequests and outputs ! raw CrawlRecords. ! ! A crawl request consists of the following fields: ! ! method: one of NEW or CHECK_FOR_RECRAWL [or CHECK_FOR_SERVER_RUNNING]. NEW ! loads a document as given by a URL. 
If network errors occur the fetcher can ! be parameterized to wait and retry a couple of times, set the host to ! BAD_STATE if unsuccessful, and output a SERVER_PROBLEM message. CHECK checks ! the document for changes (in the MD5). The crawler may behave differently when ! CHECK_FOR_RECRAWL or CHECK_FOR_SERVER_RUNNING was chosen. In the latter case ! the crawler may decide to check the host only once. In the case of CHECK an ! MD5 checksum has to be provided. [When using CHECK we don't look for changed ! Dates since they have proven to be unreliable.] ! ! url: URL: The URL of the document to crawl ! ! MD5Hash: MD5Hash: An MD5 Hash of the document, if method is CHECK ! ! interface CrawlRequest { ! ! // enum RequestMethod ! final static byte NEW = 1; ! final static byte CHECK_FOR_RECRAWL = 2; ! ! byte requestMethod; // type RequestMethod ! URL url; ! [long lastModified;] // if CHECK_FOR_RECRAWL, can be sent as ! If-Modified-Since] ! MD5Hash MD5Hash; // set to null if requestMethod == NEW ! } ! ! A CrawlRecord is the output of a crawler that contains the raw document as loaded ! by the crawler threads. It contains the following fields: ! ! url: URL: The original URL of the crawl ! ! finalURL: URL: The final URL if HTTP responds with 30x result codes. The ! crawler can be configured with a maximum number of detours to take if such a result ! code occurs. ! ! requestMethod: byte. request method as in CrawlRequest ! ! fingerprint: MD5Hash. hash value of the document contents. ! ! HTTPStatus: short. The HTTP status code as returned by the last try. e.g. 200 ! ! crawlerStatus: short. error code if not reflected through the HTTPStatus code ! ! MIMEType: String. The MIME type of the document loaded. e.g. "text/html" ! ! encoding: String. The document encoding if provided ! ! lastModified: Date. time when the doc was last changed. ! ! headers: the HTTP headers returned ! ! encoding: String. Content-Encoding as specified in a HTTP header ! ! contents: Object. 
Either a byte[] or a char[] depending on the MIME type and ! encoding. Since HTML or XML files themselves may contain an "encoding" ! attribute on their own the fetcher doesn't make any assumptions on the real ! content type. ! ! interface CrawlRecord { ! ! // crawler status ! final static byte CS_OK = 0; // if HTTPStatus == 200 ! final static byte CS_ERROR_IN_HTTP = 1; // if HTTPStatus != 200 ! final static byte CS_TOO_MANY_REDIRECTS = 2; // e.g. 301/302 redirect loop ! final static byte CS_UNKNOWN_HOST = 3; // host name doesn't exist ! final static byte CS_HOST_NOT_REACHABLE = 4; // server not running ! final static byte CS_READ_TIMEOUT = 5; // server or network too slow ! final static byte CS_NO_ROUTE_TO_HOST = 6; // network problem ! // (NoRouteToHostException) ! final static byte CS_PORT_CLOSED = 7; // no server running on this ! // port (ConnectException) ! final static byte CS_FILE_TOO_LARGE = 8; // file exceeded maximum size ! // and was truncated ! final static byte CS_IO_EXCEPTION = 100; // unknown IO exception ! ! URL url; // -> IndexRecord.URI ! URL finalURL; // -> IndexRecord.secondaryURIs[0] ! byte requestMethod; // see above ! MD5Hash fingerprint; // -> IndexRecord.fingerprint ! short HTTPStatus; // HTTP status code ! byte crawlerStatus; // that are not reflected in HTTPStatus ! String MIMEType; // IndexRecord.MIMEType ! String[][] headers; // HTTP headers ! String encoding; // ISO, UTF, Base64, Gzip, etc. ! long lastModified; // same as in CrawlRequest if not modified, else timestamp ! byte[] contents; ! } ! ! The Fetcher is controlled by a FetcherManager that distributes CrawlRequests ! among different threads. The threads get batches of crawl requests if available ! to minimize synchronization. They can also be configured such that they collect ! a couple of documents before they put them to the output queue. ! ! We will use the hashCode of the hostname modulo the number of threads to assign ! fetches to the different threads. Each thread will have a priority queue and a ! 
small host name cache for the incoming requests. (for the start we will use ! Java's built-in host name cache). This way a thread can do its work without the ! need to communicate with or block other threads. ! ! The priority queue is used to keep hosts in a wait state while new hosts are ! crawled. Each time a page is crawled from a host it will come into a wait state ! for a configurable threshold until the next request is issued. ! ! [If implemented using non blocking-IO it may also be that a thread keeps ! downloading more than one host at once. This is presumably faster since it saves ! a lot of threads and with it the task switching overhead. The old IO also needs ! the data to be copied a couple of times. The best implementation still has to be ! figured out. Presumably a set of Fetcher threads that are responsible for a ! number of hosts and use non-blocking IO will show the best performance.] ! ! The Fetcher tries to be completely bound to network I/O and will not perform ! extractions if the content is compressed (that is, it sends an "accept-encoding: ! gzip" message if configured but will not perform a decompression step). ! ! The Fetcher also has to keep track of the hosts. Since it cannot hold infos ! about all hosts in RAM, a (LRU) caching mechanism has to be used that contains ! the following information for each host (HostInfo): ! ! hostName: String: a DNS name as an identifier ! IP-Address: InetAddress: IP-Address of this host ! ipExpires: long: Expiry time for the IP cache ! robots: : a data structure that is used by a RobotsTxtFilter ! robotsExpires: long: a time that defines when robots.txt has to be reloaded ! ! ! interface HostInfo { ! String hostName; ! InetAddress ipAddress; ! long ipExpires; ! ? robots; ! long robotsExpires; ! } ! ! Since HostInfos are looked up using their hostNames they should be stored in a ! simple hash with the hostName as its key. ! ! From the caching point of view it is advisable that incoming CrawlRequests are ! 
not evenly distributed over the host name space. From a network efficiency point ! of view exactly this should be the case. This conflict may be resolved in the ! following way: Say a batch of CrawlRequests contains a maximum of 5000 hosts and ! a maximum of 100,000 requests. If one of these numbers is exceeded the batch is ! cut into several pieces that each obeys these rules. Then the HostInfo cache can ! be as large as the number of hosts in the batch and may only need to access ! secondary storage as a new batch is started. It can be implemented as a simple ! LRU cache. ! ! 3. DNS Handling ! ! Since DNS resolution takes a lot of time it is advisable to store ipAddresses of ! host names in the HostInfo structure. This calls for a URL implementation that ! doesn't do a resolution on its own, and an HTTP 1.1 implementation that can use ! the ipAddress as given in the HostInfo structure. ! ! For now we use Jakarta HTTPClient which doesn't do address resolution. ! Currently DNS resolution works the following way: ! a) a request to open a connection is sent to HTTPClient ! b) HTTPClient creates a new java.net.Socket with the host name as its argument ! c) if the host name is an IP address, Socket opens it directly. ! If it is a DNS name, ! d) Socket calls getCachedAddress() ! e) getCachedAddress will first perform a linear scan through its host name list ! to see whether resolved names have expired ! f) the host name is looked up in the cache. If it is not found, it is resolved ! through an internal Naming Service class and saved to the cache. ! Since e) takes linear time even when the name is in the cache it unnecessarily ! slows things down if we have 1000s of host names in the cache. In this case ! we would have to resolve the IP address for ourselves, or HTTPClient would have ! to do it, since it later needs the host name for sending an HTTP 1.1 request. ! ! ! 4. Robot Exclusion ! ! Since the incoming CrawlRequests may have been generated a long time ago the ! 
fetcher has to take care about changed robot exclusion policies while it is ! fetching the documents. For this sake a filter has to be applied shortly before ! a request is made to the server, and robots.txt files have to be reloaded before ! the first request to a server is made and after a specific time has elapsed. ! ! 6. Persistence ! ! CrawlRequests are usually performed in batches that are read from secondary ! storage. These files again may contain a large number of requests that are read ! in steps of <n> requests as specified in the config. Fast crawls demand for a ! large number of hosts in these files and for avoidance of the same hosts in ! subsequent URLs [see Shapenyuk/Suel]. ! ! CrawlRecords again are also written in batches of <n> records and are also ! distributed among several files. They may also be distributed among different ! directories in order to use NFS as a cheap distribution mechanism for the ! indexing step. ! ! ! 7. Distribution ! ! A Fetcher/FetcherManager combination can be distributed among different hosts if ! extracted links are divided such that a node is made responsible for a distinct ! set of hosts. The communication between different crawler nodes takes place in ! batches. To avoid a central component that distributes these Collections of ! CrawlRequests, each node has to know about the other nodes and which hosts this ! node addresses. ! It seems viable to use the hash value of the hostname of the URL to be crawled ! to split this up. But this is supposed to be done in a processing component like ! directly after link extraction. In the Shkabenyuk/Suel this is named the ! "crawling application". Thus it is not part of the crawler itself. ! ! For ease of use the crawler should adapt if a new crawler node is added. Say ! there are three nodes, and all crawl requests are divided into three queues that ! are distributed to these nodes. If a new node is started, the crawling ! 
application should get a message and start dividing the URLs into four pieces. ! On the other hand, if more than one crawling application is needed, the fetchers ! need to know where to send the downloaded files. This again could be divided by ! the URL. A similar mechanism should apply. ! ! 8. Configuration ! ! - FetcherManager ! - method: NIO or old IO ! - number of threads ! - NIO: number of concurrent requests (=concurrent hosts) per thread ! - number of seconds between subsequent requests to a host ! - number of redirects to follow after page is quit with TOO_MANY_REDIRECTS ! - maximum file size ! - number of seconds to wait for a server to send the file completely ! - HTTP User Agent String ! - size of host name cache ! - size of temp cache for loading docs ! - use "Accept-Encoding: gzip [Compress, Deflate?]" ! ! 9. From CrawlRecords to IndexRecords ! ! Crawl- and IndexRecords seem to be pretty similar, but in fact they differ in a ! variety of features. ! An IndexRecord is crawl-agnostic. It is used for different document sources and ! thus doesn't know about HTTP status codes and the like. ! ! [There will be a converter between Crawl- and IndexRecords at some time in the ! pipeline. This will be configurable such that CrawlRecord entries may become ! generic fields within an IndexRecord] ! ! 9. Log Files ! ! 10. Incremental Crawling ! ! 11. Startup/Shutdown ! ! 12. Packages and Dependencies ! ! ! ! Index: framework.txt =================================================================== RCS file: /cvsroot/larm/larm/docs/framework.txt,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** framework.txt 30 Jun 2003 14:19:36 -0000 1.3 --- framework.txt 24 Jul 2003 12:18:17 -0000 1.4 *************** *** 1,316 **** ! ! $Id$ ! ! ------------------------------------------------------------------------------- ! Part I. Framework ! ------------------------------------------------------------------------------- ! ! ! I. Configuration ! ! 
Configuration drove the first discussions about LARM since it was a major ! weakness of the old crawler that these issues hadn't been properly addressed. ! ! In general, several options exist: ! 1. Use a property file ! 2. Use an XML file ! 3. Use several XML files and separate pipeline construction and parameterization ! 4. Use configuration messages that are passed through the pipelines and allow ! for reconfiguring it at runtime. ! The fourth option would be nice if the crawler should be controlled via a web ! interface or the like. The third one resembles the Avalon Phoenix model, ! although it is not sure if that really does the same. ! ! After the discussion we came to the conclusion that ! - Java property files are too restricted to model pipelines ! - Avalon seems to be overkill and contradicts KISS ! ! Nevertheless we use some of the Avalon ideas, namely: ! - A component initializes its subcomponents by calling a configure() method. ! Maybe other lifecycle mehtods may be implemented as well. ! - configure() gets its part of the configuration file. It is up to the enclosing ! component to cut out the right part (using the class below and XPath) ! ! ! 1. XML Configuration ! ! At this time we use a single XML file to form and configure the pipelines. ! ! Configuration is done through a single class that wraps a DOM represenation of ! the XML and facilitates access through XPath. ! ! Currently the interface looks like this: ! ! class Configuration ! { ! Configuration(Reader config); ! Configuration getSubConfig(String xpath); ! ! String getPropertyAsString(String xpath); ! X getPropertyAsX(String xpath); ! ! Node getCurrentNode(); // can we hide this? ! Node getNode(String xpath); // can we hide this? ! NodeList getNodes(String xpath); // can we hide this? ! } ! ! Configuration can resolve strings like ${my.property} to a system property or ! something like $${/my/xpath/} to an xpath expression from the current file. ! ! [Remark] ! ! 
The LARM main program analyzes the following subsections: ! <properties> <pipes> and <sources> ! ! The properties section is similar to ANT's properties section. Its contents is ! read at startup time. Dependencies are resolved when a property is used (i.e. ! resolved by an underlying component). ! ! The <pipes> and <sources> sections are passed to two global class instances (in ! Avalon they would be called blocks): The config.PropertyManager, ! pipes.PipeManager and the sources.SourceManager. ! ! Each of these classes initializes its subcomponents in the same way they are ! initialized. This is very similar to Avalon's Inversion of Control pattern ! (IoC): ! ! All pipeline classes (PipeManager, SourceManager, Source, MessageProcessor, ! etc.) have or can have a method called "configure(Configuration c)", derived ! from a lifecycle interface called config.Configurable. ! ! ! 2. Startup/Shutdown ! ! LARM gets the path to an XML configuration file as a parameter. Different server ! modes depend on the sources and pipeline configurations in these files. ! ! Startup should be something like ! ! java larm.root.LARM <configfile> ! ! LARM then ! 1. resolves properties in the <properties> section through ! config.PropertyManager ! 2. initialises the pipelines and registers them ! 3. initialises the sources and registers them ! 4. passes the registry of sources and pipes to the classes implementing ! the framework.Contextualizable interface ! 5. calls "configure" on each of the pipelines (through PipeManager.configure()) ! 6. calls "configure" on each of the sources (through SourceManager.configure()) ! 7. calls "start" on the nonblocking pipes (through PipeManager.start()) ! 8. calls "start" on the nonblocking sources (through SourceManager.start()) ! ! ! When is LARM shut down? Since pipelines naturally wait for incoming messages, ! this depends on the nature of the Sources and other services. For development we ! 
will most likely use sources that run through a directory, emit the messages ! contained to a pipeline, and shut down. That means the source may signal that ! the application should exit. Since it is likely that the pipelines are still at ! work, the app will have to wait until all messages are consumed and processed. ! ! [There may be other services that may call for a shutdown: a CTRL-C handler or a ! web service interface.] ! ! ! II. Messaging Framework ! ! LARM basically is concerned with processing pieces of data and moving it along ! what we call a processing pipeline. ! ! The pipeline framework is a set of classes that simplifies this task: It allows ! for a separation of different assembly parts of the whole system. That way ´ ! different parts of the pipeline can be put into different classes and can be ! developed rather independently. ! ! In contrast to message-queue systems it is a low-level in-process framework: If ! it is known that only thread is involved, the components need not even be thread ! safe. The aim is to be able to process a very large number of small messages ! very rapidly. ! ! ! 1 Active and Passive Components ! ! Active Components run in their own thread. They may respond to external events ! (socket calls, timer events or whatsoever). Passive Components just provide ! services to other passive or active components. Sources (see below) will mostly ! be active components. That is, they operate the subsequent pipeline. ! ! A MessageProcessor (MP) is a simple class that is called by a pipeline to handle ! a message. It may alter the message, filter it, save it somewhere, etc. It ! either returns null (forming a message sink) or it returns the message (most of ! the time the same message it got, but it may also return a different one). ! Examples of an MP would be a RobotExclusionFilter (filtering some of the URLs ! from the URL list), a PDF to XML converter (reducing PDF to a common metaformat ! 
that is understood by the indexing component), a FileSystemStorage that saves ! incoming documents on disk, a JMSStorage that saves them to a message queue, or ! a LuceneStorage that adds a document to a Lucene index. An MP could as well ! contain a BlockingPipeline (see below) forming a nested pipeline. ! ! [Is the storage a required part of the pipeline? If so I think we should break ! it up into more distinct pieces to there can be some control programmatically. ! If not is there a required order?] ! ! 2. MessagePipelines ! ! MessageProcessors are put together into message pipelines. There are two types ! of them: BlockingPipelines and NonblockingPipelines. ! ! Pipelines process objects of type Message: ! ! interface Message implements Serializable ! {} ! ! You can see that this is a very generic concept. Its behavior only depends on ! the processor implementations. Messages have to be serializable since they will ! mostly stay on disk. ! Messages should only be data containers and should not contain business logic ! or be dependent on types other than primitive types, Collections, or strings. ! Objects included in the message should form a part-of relationship, no ! referential relationship, since this would make serialization and ! deserialization much more complicated. ! ! Messages are put into pipelines: ! ! interface Pipeline implements MessageProcessor ! { ! public Message processMessage(Message); ! public Message processMessages(Collection); // Collection<Message> ! } ! ! There are two types of pipelines: BlockingPipelines and NonblockingPipelines. ! ! A BlockingPipeline processes a message by calling (at most) all of its ! MessageProcessors in a row. A MessageProcessor gets the Message, may alter it, ! and returns it again. The reference returned is passed to the next processor in ! the row. After the last MP the resulting message is returned. ! ! BlockingPipeline may be designed not to be thread safe (i.e. because it is used ! 
from within an NonBlockingPipeline and thus only accessed by one thread), as do ! the MPs. (A BlockingPipeline may as well be an MP, which allows for nesting ! pipelines). ! ! A NonBlockingPipeline has an extra thread that handles the messages. Therefore, ! at a processMessage() call, the message is written into a message queue and ! always returns null. The processor thread handles all messages until the queue ! is empty. Internally the AsynchronousProcessor consists of a BlockingPipeline ! that is operated by the ProcessorThread. ! ! The Queue implementation will usually be an in-memory FIFO queue, but may be ! exchanged depending on the needs. A queue may block if it is full. ! ! The MessageProcessor interface looks like this: ! ! interface MessageProcessor ! { ! public Message processMessage(Message); ! } ! ! If you implement a MessageProcessor and need lifecycle methods, you can ! implement one or more of the interfaces larm.config.Configurable, ! larm.framework.Contextualizable, or larm.framework.Startable. The Pipeline will ! take care to call the methods contained in these interfaces in the order as ! specified in section I. ! ! Configuration: ! ! pipeline parameters: ! ! - @name: A global name that identifies the pipe. If existent, the pipeline ! will be registered within the PipelineManager. ! - processors: A block of processors. They are put into the pipeline in the ! order in which they are specified in the configuration. ! ! Additionally, NonblockingPipeline has the following parameters: ! ! - @queueSize: integer. number of messages the queue is able to handle. If there ! are more messages, a call to putMessage() will block until all messages are ! fed into the queue ! - queue (optional): sets a different queue implementation than the default ! larm.pipes.InMemoryQueue. ! Parameter: @type: A class name of type larm.pipes.Queue that is used for the ! queue. May contain config parameters in the block ! ! Example: ! ! <blockingPipeline name="pipe1"> ! 
<processors> ! <processor type="larm.processors.DoNothingProcessor"> ! <someArg>someVal</someArg> ! </processor> ! </processors> ! </blockingPipeline> ! ! <nonBlockingPipeline name="pipe2" queueSize="100"> ! <queue type="myPackage.myQueue"> ! <myQueueParameter/> ! </queue> ! <processors> ! <processor type="larm.processors.DoNothingProcessor"> ! <someArg>someVal</someArg> ! </processor> ! <processor type="larm.processors.MyFancyProcessor"/> ! </processors> ! </blockingPipeline> ! ! 3. Sources and Drains ! ! Sources are classes that actively pump new messages into a pipeline. In the ! simplest case a source loads a file given as a parameter, puts it into a ! pipeline, and exits. ! ! Framework provides the following sources: ! ! - FileSource ! reads messages from a given file or a set of files, puts them into the ! pipeline, and exits. ! The file must be a valid batch. ! ! parameters: ! - fileName: The file to read ! - fileSet: a file set like in ANT. Describes files to be put in to ! the queue ! - pipeline: The name of the pipeline ! - delete true/false: delete file(s) after they were put to the pipe. ! ! ! xml config example: ! <fileSource> ! <fileName>c:/larm/test/file1</fileName> ! <pipeline>testPipe</pipeline> ! </fileSource> ! ! example 2: ! <fileSource> ! <fileset dir="c:/larm/test/*.lst"/> ! <pipeline>testPipe</pipeline> ! </fileSource> ! ! - FileMonitorSource ! monitors a set of files given by the fileset parameter and looks for changes ! in the set described by the fileset pattern. ! when new files are found, they are appended to an internal in-memory queue. ! These files are then put into the pipeline given, deleted, and deleted from ! the internal in-memory queue. ! ! parameters: ! - fileset ! - delay: time (in seconds) between runs of the monitor ! ! <fileMonitorSource> ! <fileset dir="somedir"/> ! <pipeline>testPipe</pipeline> ! <delay>30 s</delay> ! </fileMonitorSource> ! ! 4. Notifications or Poll[ing] ! ! ! ! 5. Batch file operation ! ! 
A batch file contains a set of Objects inherited from the type Message. They are ! read in blocks ! ! [6. Batch file indexing] ! ! --- 1,316 ---- ! ! $Id$ ! ! ------------------------------------------------------------------------------- ! Part I. Framework ! ------------------------------------------------------------------------------- ! ! ! I. Configuration ! ! Configuration drove the first discussions about LARM since it was a major ! weakness of the old crawler that these issues hadn't been properly addressed. ! ! In general, several options exist: ! 1. Use a property file ! 2. Use an XML file ! 3. Use several XML files and separate pipeline construction and parameterization ! 4. Use configuration messages that are passed through the pipelines and allow ! for reconfiguring it at runtime. ! The fourth option would be nice if the crawler should be controlled via a web ! interface or the like. The third one resembles the Avalon Phoenix model, ! although it is not sure if that really does the same. ! ! After the discussion we came to the conclusion that ! - Java property files are too restricted to model pipelines ! - Avalon seems to be overkill and contradicts KISS ! ! Nevertheless we use some of the Avalon ideas, namely: ! - A component initializes its subcomponents by calling a configure() method. ! Maybe other lifecycle mehtods may be implemented as well. ! - configure() gets its part of the configuration file. It is up to the enclosing ! component to cut out the right part (using the class below and XPath) ! ! ! 1. XML Configuration ! ! At this time we use a single XML file to form and configure the pipelines. ! ! Configuration is done through a single class that wraps a DOM represenation of ! the XML and facilitates access through XPath. ! ! Currently the interface looks like this: ! ! class Configuration ! { ! Configuration(Reader config); ! Configuration getSubConfig(String xpath); ! ! String getPropertyAsString(String xpath); ! 
X getPropertyAsX(String xpath); ! ! Node getCurrentNode(); // can we hide this? ! Node getNode(String xpath); // can we hide this? ! NodeList getNodes(String xpath); // can we hide this? ! } ! ! Configuration can resolve strings like ${my.property} to a system property or ! something like $${/my/xpath/} to an xpath expression from the current file. ! ! [Remark] ! ! The LARM main program analyzes the following subsections: ! <properties> <pipes> and <sources> ! ! The properties section is similar to ANT's properties section. Its contents is ! read at startup time. Dependencies are resolved when a property is used (i.e. ! resolved by an underlying component). ! ! The <pipes> and <sources> sections are passed to two global class instances (in ! Avalon they would be called blocks): The config.PropertyManager, ! pipes.PipeManager and the sources.SourceManager. ! ! Each of these classes initializes its subcomponents in the same way they are ! initialized. This is very similar to Avalon's Inversion of Control pattern ! (IoC): ! ! All pipeline classes (PipeManager, SourceManager, Source, MessageProcessor, ! etc.) have or can have a method called "configure(Configuration c)", derived ! from a lifecycle interface called config.Configurable. ! ! ! 2. Startup/Shutdown ! ! LARM gets the path to an XML configuration file as a parameter. Different server ! modes depend on the sources and pipeline configurations in these files. ! ! Startup should be something like ! ! java larm.root.LARM <configfile> ! ! LARM then ! 1. resolves properties in the <properties> section through ! config.PropertyManager ! 2. initialises the pipelines and registers them ! 3. initialises the sources and registers them ! 4. passes the registry of sources and pipes to the classes implementing ! the framework.Contextualizable interface ! 5. calls "configure" on each of the pipelines (through PipeManager.configure()) ! 6. calls "configure" on each of the sources (through SourceManager.configure()) ! 7. 
calls "start" on the nonblocking pipes (through PipeManager.start()) ! 8. calls "start" on the nonblocking sources (through SourceManager.start()) ! ! ! When is LARM shut down? Since pipelines naturally wait for incoming messages, ! this depends on the nature of the Sources and other services. For development we ! will most likely use sources that run through a directory, emit the messages ! contained to a pipeline, and shut down. That means the source may signal that ! the application should exit. Since it is likely that the pipelines are still at ! work, the app will have to wait until all messages are consumed and processed. ! ! [There may be other services that may call for a shutdown: a CTRL-C handler or a ! web service interface.] ! ! ! II. Messaging Framework ! ! LARM basically is concerned with processing pieces of data and moving it along ! what we call a processing pipeline. ! ! The pipeline framework is a set of classes that simplifies this task: It allows ! for a separation of different assembly parts of the whole system. That way ´ ! different parts of the pipeline can be put into different classes and can be ! developed rather independently. ! ! In contrast to message-queue systems it is a low-level in-process framework: If ! it is known that only thread is involved, the components need not even be thread ! safe. The aim is to be able to process a very large number of small messages ! very rapidly. ! ! ! 1 Active and Passive Components ! ! Active Components run in their own thread. They may respond to external events ! (socket calls, timer events or whatsoever). Passive Components just provide ! services to other passive or active components. Sources (see below) will mostly ! be active components. That is, they operate the subsequent pipeline. ! ! A MessageProcessor (MP) is a simple class that is called by a pipeline to handle ! a message. It may alter the message, filter it, save it somewhere, etc. It ! 
either returns null (forming a message sink) or it returns the message (most of ! the time the same message it got, but it may also return a different one). ! Examples of an MP would be a RobotExclusionFilter (filtering some of the URLs ! from the URL list), a PDF to XML converter (reducing PDF to a common metaformat ! that is understood by the indexing component), a FileSystemStorage that saves ! incoming documents on disk, a JMSStorage that saves them to a message queue, or ! a LuceneStorage that adds a document to a Lucene index. An MP could as well ! contain a BlockingPipeline (see below) forming a nested pipeline. ! ! [Is the storage a required part of the pipeline? If so I think we should break ! it up into more distinct pieces to there can be some control programmatically. ! If not is there a required order?] ! ! 2. MessagePipelines ! ! MessageProcessors are put together into message pipelines. There are two types ! of them: BlockingPipelines and NonblockingPipelines. ! ! Pipelines process objects of type Message: ! ! interface Message implements Serializable ! {} ! ! You can see that this is a very generic concept. Its behavior only depends on ! the processor implementations. Messages have to be serializable since they will ! mostly stay on disk. ! Messages should only be data containers and should not contain business logic ! or be dependent on types other than primitive types, Collections, or strings. ! Objects included in the message should form a part-of relationship, no ! referential relationship, since this would make serialization and ! deserialization much more complicated. ! ! Messages are put into pipelines: ! ! interface Pipeline implements MessageProcessor ! { ! public Message processMessage(Message); ! public Message processMessages(Collection); // Collection<Message> ! } ! ! There are two types of pipelines: BlockingPipelines and NonblockingPipelines. ! ! A BlockingPipeline processes a message by calling (at most) all of its ! 
MessageProcessors in a row. A MessageProcessor gets the Message, may alter it, ! and returns it again. The reference returned is passed to the next processor in ! the row. After the last MP the resulting message is returned. ! ! BlockingPipeline may be designed not to be thread safe (i.e. because it is used ! from within an NonBlockingPipeline and thus only accessed by one thread), as do ! the MPs. (A BlockingPipeline may as well be an MP, which allows for nesting ! pipelines). ! ! A NonBlockingPipeline has an extra thread that handles the messages. Therefore, ! at a processMessage() call, the message is written into a message queue and ! always returns null. The processor thread handles all messages until the queue ! is empty. Internally the AsynchronousProcessor consists of a BlockingPipeline ! that is operated by the ProcessorThread. ! ! The Queue implementation will usually be an in-memory FIFO queue, but may be ! exchanged depending on the needs. A queue may block if it is full. ! ! The MessageProcessor interface looks like this: ! ! interface MessageProcessor ! { ! public Message processMessage(Message); ! } ! ! If you implement a MessageProcessor and need lifecycle methods, you can ! implement one or more of the interfaces larm.config.Configurable, ! larm.framework.Contextualizable, or larm.framework.Startable. The Pipeline will ! take care to call the methods contained in these interfaces in the order as ! specified in section I. ! ! Configuration: ! ! pipeline parameters: ! ! - @name: A global name that identifies the pipe. If existent, the pipeline ! will be registered within the PipelineManager. ! - processors: A block of processors. They are put into the pipeline in the ! order in which they are specified in the configuration. ! ! Additionally, NonblockingPipeline has the following parameters: ! ! - @queueSize: integer. number of messages the queue is able to handle. If there ! 
are more messages, a call to putMessage() will block until all messages are ! fed into the queue ! - queue (optional): sets a different queue implementation than the default ! larm.pipes.InMemoryQueue. ! Parameter: @type: A class name of type larm.pipes.Queue that is used for the ! queue. May contain config parameters in the block ! ! Example: ! ! <blockingPipeline name="pipe1"> ! <processors> ! <processor type="larm.processors.DoNothingProcessor"> ! <someArg>someVal</someArg> ! </processor> ! </processors> ! </blockingPipeline> ! ! <nonBlockingPipeline name="pipe2" queueSize="100"> ! <queue type="myPackage.myQueue"> ! <myQueueParameter/> ! </queue> ! <processors> ! <processor type="larm.processors.DoNothingProcessor"> ! <someArg>someVal</someArg> ! </processor> ! <processor type="larm.processors.MyFancyProcessor"/> ! </processors> ! </nonBlockingPipeline> ! ! 3. Sources and Drains ! ! Sources are classes that actively pump new messages into a pipeline. In the ! simplest case a source loads a file given as a parameter, puts it into a ! pipeline, and exits. ! ! Framework provides the following sources: ! ! - FileSource ! reads messages from a given file or a set of files, puts them into the ! pipeline, and exits. ! The file must be a valid batch. ! ! parameters: ! - fileName: The file to read ! - fileSet: a file set like in ANT. Describes files to be put into ! the queue ! - pipeline: The name of the pipeline ! - delete true/false: delete file(s) after they were put to the pipe. ! ! ! xml config example: ! <fileSource> ! <fileName>c:/larm/test/file1</fileName> ! <pipeline>testPipe</pipeline> ! </fileSource> ! ! example 2: ! <fileSource> ! <fileset dir="c:/larm/test/*.lst"/> ! <pipeline>testPipe</pipeline> ! </fileSource> ! ! - FileMonitorSource ! monitors a set of files given by the fileset parameter and looks for changes ! in the set described by the fileset pattern. ! when new files are found, they are appended to an internal in-memory queue. ! 
These files are then put into the pipeline given, deleted, and deleted from ! the internal in-memory queue. ! ! parameters: ! - fileset ! - delay: time (in seconds) between runs of the monitor ! ! <fileMonitorSource> ! <fileset dir="somedir"/> ! <pipeline>testPipe</pipeline> ! <delay>30 s</delay> ! </fileMonitorSource> ! ! 4. Notifications or Poll[ing] ! ! ! ! 5. Batch file operation ! ! A batch file contains a set of Objects inherited from the type Message. They are ! read in blocks ! ! [6. Batch file indexing] ! ! Index: indexer.txt =================================================================== RCS file: /cvsroot/larm/larm/docs/indexer.txt,v retrieving revision 1.5 retrieving revision 1.6 diff -C2 -d -r1.5 -r1.6 *** indexer.txt 30 Jun 2003 14:19:36 -0000 1.5 --- indexer.txt 24 Jul 2003 12:18:17 -0000 1.6 *************** *** 1,228 **** ! ! $Id$ ! ! ------------------------------------------------------------------------------- ! Part IV. The Indexer ! ------------------------------------------------------------------------------- ! ! The indexer is a simple component that gets messages of type IndexRecord from a ! queue and outputs them to an index. Our implementation will use a Lucene index ! for this task, although other search engines could be used as well. ! ! Usually the IndexRecords are provided in batches which may reside in files of ! IndexRecord objects. A BatchFileSource can be used to monitor a directory for ! new batch files. ! ! For each IndexRecord, the Indexer gets an IndexRecord that contains the ! following fields: ! ! command: byte: ADD, [UPDATE], or DELETE. Defines if the IndexRecord should be ! added, updated, or deleted from the index. (UPDATE may not be necessary since ! an ADD with the same PrimaryURI may automatically perform an UPDATE) ! ! primaryURI: URI: primary URI of the IndexRecord. If the IndexRecord comes from ! the web or a file system, this is simply the URL. If it represents a tuple ! 
from a database, the provider has to come up with a URN that forms a primary ! key for the IndexRecord. ! ! Since web documents may be accessible under different URLs a mechanism has to be ! provided to find a primary URL, e.g. by using the one with the highest number of ! inlinks. ! ! In case of ADD or UPDATE the following information has to be provided: ! ! secondaryURIs: Collection: A list of secondary URIs of the IndexRecord. If the ! URI is ambiguous (e.g. if a document is represented by more than one URL) this ! ! MD5Hash: MD5Hash: The MD5 hash of the doc. In case of a recrawl this hash will ! be sent to the gatherer to determine whether the IndexRecords contents have ! changed. ! ! lastChangedDate: Date: The time this indexing has occurred. In case of a crawler ! the time the document was fetched. ! ! documentWeight: float. It is left to the processing pipeline to set this field ! accordingly, e.g. by analyzing the document-link-graph. ! ! MIMEtype: String. The MIME type of the original document ! ! fields: A Collection of <fieldname: String, fieldweight: float, value: ! [LargeText], methods: byte, fieldType: byte> describing the document content. ! They will be indexed as-is. "flags" can be one or more from <INDEXED, STORED, ! TOKENIZED>. fieldType is one of <TEXT, DATE> ! ! The exact contents of these fields is specified through the RecordProcessors. ! Usually they will contain a step in which binary content (PDFs etc) is converted ! to text, a step in which documents are split up into different fields (e.g. ! title, header, headings, body) ! ! The indexer then performs the analysis of different fields and splits the field ! up into index tokens using the standard Lucene analysers infrastructure. ! ! The following shows Java interfaces for the type described. Remarks show a ! possible implementation using J2SDK 1.5 (Tiger): ! ! interface IndexRecord implements Message ! { ! // enum Command ! final static byte CMD_ADD = (byte)'a'; ! 
final static byte CMD_UPDATE = (byte)'u'; ! final static byte CMD_DELETE = (byte)'d'; ! ! byte command; // type: Co... [truncated message content] |
From: <ot...@us...> - 2003-07-24 12:18:20
|
Update of /cvsroot/larm/larm In directory sc8-pr-cvs1:/tmp/cvs-serv31326 Modified Files: README.txt update.website.txt Log Message: - Updated. Index: README.txt =================================================================== RCS file: /cvsroot/larm/larm/README.txt,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** README.txt 24 Jun 2003 17:22:05 -0000 1.2 --- README.txt 24 Jul 2003 12:18:17 -0000 1.3 *************** *** 1,27 **** ! ! $Id$ ! ! LARM - Lucene Advanced Retrieval Machine. ! ! ! Please read the docs in docs/ first, starting with contents.txt ! ! ! You can use ANT to build the project. Simply call ! ! ant ! ! and LARM will build into ./build. The first time it builds it will look for ! external libraries like HTTPClient and download them if necessary. ! ! LARM was written using the Eclipse IDE. As soon as the libraries are downloaded, ! You can import the workspace simply by importing the checked out directory into ! the eclipse workspace. Eclipse will build the project into the ./build directory. ! ! To run LARM you will be able to use larm.root.LARM. As soon as this is written ! you can call ! ! java -classpath ./build/classes:[other libs] larm.root.LARM <configfile> ! ! We will provide config files for various applications. ! --- 1,40 ---- ! ! $Id$ ! ! LARM - Lucene Advanced Retrieval Machine. ! ! Please read the docs in docs/ first, starting with contents.txt ! ! Prerequisites: You need ! - Java 2 SDK starting from 1.4.0 ! - ANT (see http://jakarta.apache.org/ant) ! ! You can use ANT to build the project. Simply call ! ! ant ! ! and LARM will build into ./build. [The first time it builds it will look for ! external libraries like HTTPClient and downloads them if necessary.] ! ! LARM was written using the Eclipse IDE. As soon as the libraries are downloaded, ! You can import the workspace simply by importing the checked out directory into ! the eclipse workspace. Eclipse will build the project into the ./build directory. 
! ! To run LARM you will be able to use larm.root.LARM. ! ! You can use ! ! larm.bat empty ! or ! larm.sh empty ! ! to run LARM with an empty test pipeline in src/config/empty.xml. ! This is just for testing purposes. ! ! We will provide config files for various applications in src/config. Runtime ! arguments will have to be passed as system properties or in a properties file ! that is referenced from the config file. ! ! ! ! Index: update.website.txt =================================================================== RCS file: /cvsroot/larm/larm/update.website.txt,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** update.website.txt 30 Jun 2003 14:17:25 -0000 1.1 --- update.website.txt 24 Jul 2003 12:18:17 -0000 1.2 *************** *** 1,8 **** ! In order to update the website... ! ! - check out the module "website" from LARM's CVS ! - edit the files in htdocs/ ! - commit the changes ! - log in via SSH to shell.sourceforge.net ! - cd to /home/groups/l/la/larm ! - perform a cvs -d :ext:<user>@cvs1:/cvsroot/larm co website --- 1,8 ---- ! In order to update the website... ! ! - check out the module "website" from LARM's CVS ! - edit the files in htdocs/ ! - commit the changes ! - log in via SSH to shell.sourceforge.net ! - cd to /home/groups/l/la/larm ! - perform a cvs -d :ext:<user>@cvs1:/cvsroot/larm co website |
From: <ot...@us...> - 2003-07-24 11:48:19
|
Update of /cvsroot/larm/larm/src/java/larm/config In directory sc8-pr-cvs1:/tmp/cvs-serv26943/src/java/larm/config Added Files: PropertyManager.java Log Message: - Initial checkin. --- NEW FILE: PropertyManager.java ---
/*
 *
 * $Id: PropertyManager.java,v 1.1 2003/07/24 11:48:08 otis Exp $
 */
package larm.config;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;

import larm.framework.config.*;

/**
 * Reads &lt;property&gt; elements from the LARM configuration and publishes
 * them as JVM system properties.
 *
 * Two element forms are supported:
 * <ul>
 *   <li>&lt;property name="..." value="..."/&gt; -- sets one system property</li>
 *   <li>&lt;property file="..."/&gt; -- loads a java.util.Properties file and
 *       merges its entries into the existing system properties</li>
 * </ul>
 *
 * A missing property file is logged and skipped (best effort); an I/O error
 * while reading a file is logged as SEVERE but does not abort configuration.
 *
 * @author
 */
public class PropertyManager implements Configurable
{
    static final Logger log = Logger.getLogger(PropertyManager.class.getName());

    public PropertyManager()
    {
    }

    /**
     * Applies all &lt;property&gt; elements found in the given configuration
     * to the JVM system properties. Inline name/value pairs are applied
     * first, then property files, so on key collisions within one call the
     * file entries win.
     *
     * @param conf the configuration block containing the property elements
     * @see larm.config.Configurable#configure(larm.config.Configuration)
     */
    public void configure(Configuration conf)
    {
        log.info("configuring PropertyManager");

        // Form 1: <property name="..." value="..."/>
        ConfigList props = conf.getSubConfigList("property[@name and @value]");
        for(int i = 0; i < props.length(); i++)
        {
            Configuration p = props.item(i);
            // deliberately not resolving placeholders: values are literal
            String name = p.getPropertyAsStringDontResolve("@name");
            String value = p.getPropertyAsStringDontResolve("@value");
            log.info("found property: " + name + "=" + value);
            System.setProperty(name, value);
        }

        // Form 2: <property file="..."/>
        props = conf.getSubConfigList("property[@file]");
        for(int i = 0; i < props.length(); i++)
        {
            String fileName = null;
            FileInputStream in = null;
            try
            {
                Configuration p = props.item(i);
                fileName = p.getProperty("@file");
                log.info("found property file: " + fileName);
                Properties properties = new Properties();
                in = new FileInputStream(fileName);
                properties.load(in);
                // Merge into the existing system properties. The original
                // called System.setProperties(properties), which REPLACES
                // the whole table and silently drops the standard JVM
                // properties as well as everything set by the loop above.
                System.getProperties().putAll(properties);
            }
            catch(FileNotFoundException e)
            {
                // best effort: a missing file is not fatal
                log.config("Could not find property file '" + fileName + "'");
            }
            catch(IOException e)
            {
                log.log(Level.SEVERE,
                    "I/O Exception while opening property file " + fileName, e);
            }
            finally
            {
                // always release the file handle; the original leaked it
                if(in != null)
                {
                    try
                    {
                        in.close();
                    }
                    catch(IOException ignored)
                    {
                        // nothing sensible to do on close failure
                    }
                }
            }
        }
        log.exiting(PropertyManager.class.getName(), "configure");
    }
}
|