larm-cvs Mailing List for Lucene Advanced Retrieval Machine (LARM)
Brought to you by:
cmarschner,
otis
You can subscribe to this list here.
| 2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
(31) |
Jul
(25) |
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2011 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
(1) |
Jul
|
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
|
From: <ot...@us...> - 2003-07-29 16:01:29
|
Update of /cvsroot/larm/larm
In directory sc8-pr-cvs1:/tmp/cvs-serv24630
Added Files:
larm.bat larm.sh build.xml
Log Message:
- Initial checkin.
--- NEW FILE: larm.bat ---
java -classpath build/classes larm.root.LARM src/config/%1.xml
--- NEW FILE: larm.sh ---
java -classpath build/classes larm.root.LARM src/config/$1.xml
--- NEW FILE: build.xml ---
<project name="larm" default="compile" basedir=".">
<property file="${basedir}/build.properties"/> <!-- Component local -->
<property name="lib.dir" value="./lib"/>
<!-- The current version number of this component -->
<property name="app.title" value="LARM"/>
<property name="app.version" value="V0.0"/>
<!-- The base directory for component sources -->
<property name="source.home" value="src"/>
<!-- The base directory for unit test sources -->
<property name="test.home" value="src/test"/>
<!-- The base directory for compilation targets -->
<property name="build.home" value="build"/>
<!-- The base directory for distribution targets -->
<property name="dist.home" value="dist"/>
<!-- Should Java compilations set the 'debug' compiler option? -->
<property name="compile.debug" value="true"/>
<!-- Should Java compilations set the 'deprecation' compiler option? -->
<property name="compile.deprecation" value="true"/>
<!-- Should Java compilations set the 'optimize' compiler option? -->
<property name="compile.optimize" value="true"/>
<!-- Construct compile classpath -->
<path id="base.classpath">
<pathelement location="${build.home}/classes"/>
</path>
<path id="compile.classpath">
<pathelement location="${build.home}/classes"/>
<pathelement location="${junit.jar}"/>
</path>
<path id="test.classpath">
<pathelement location="${build.home}/classes"/>
<pathelement location="${build.home}/tests"/>
<pathelement location="${junit.jar}"/>
<pathelement location="${xalan.jar}"/>
</path>
<!-- The root test to execute -->
<property name="test.runner" value="junit.swingui.TestRunner"/>
<property name="test.entry" value="larm.AllTests"/>
<!-- ========== Targets: "Internal" Targets =============================== -->
<target name="init"
description="Initialize and evaluate conditionals">
<echo message="-------- ${app.title} ${app.version} --------"/>
</target>
<target name="prepare" depends="init"
description="Prepare build directory">
<mkdir dir="${build.home}"/>
<mkdir dir="${build.home}/classes"/>
<mkdir dir="${build.home}/tests"/>
</target>
<!-- ========== Targets: "External" Targets =============================== -->
<!-- Packages the compiled classes into ${dist.home}/larm.jar.
     Depends on "compile" for the classes and on "doc" for the documentation tree. -->
<target name="dist" depends="compile,doc"
        description="Create binary distribution">
  <!-- the distribution directory only needs to be created once -->
  <mkdir dir="${dist.home}"/>
  <jar jarfile ="${dist.home}/larm.jar"
       basedir ="${build.home}/classes"
       manifest ="${build.home}/conf/MANIFEST.MF">
    <metainf dir="${dist.home}">
      <include name="LICENSE.txt"/>
    </metainf>
  </jar>
</target>
<!-- ========== Targets: "External" Targets: Clean-up ===================== -->
<target name="clean"
description="Clean build and distribution directories">
<delete dir="${build.home}"/>
<delete dir="${dist.home}"/>
</target>
<target name="all" depends="clean,compile"
description="Clean and compile all components"/>
<!-- ========== Targets: "External" Targets: Compilation ================== -->
<target name="compile" depends="prepare"
description="Compile shareable components">
<javac srcdir ="${source.home}/java"
destdir ="${build.home}/classes"
debug ="${compile.debug}"
source ="1.4"
deprecation ="${compile.deprecation}"
optimize ="${compile.optimize}">
<classpath refid="compile.classpath"/>
</javac>
</target>
<target name="compile.tests" depends="compile"
description="Compile unit test cases">
<javac srcdir ="${test.home}"
destdir ="${build.home}/tests"
debug ="${compile.debug}"
source ="1.4"
deprecation ="${compile.deprecation}"
optimize ="${compile.optimize}">
<classpath refid="test.classpath"/>
</javac>
<copy todir="${build.home}/tests" filtering="on">
<fileset dir="${test.home}" excludes="**/*.java"/>
</copy>
</target>
<!-- ========== Targets: "External" Targets: Testing ====================== -->
<target name="test" depends="compile.tests" if="test.entry"
description="Run all unit test cases">
<java classname="${test.runner}" fork="yes" failonerror="${test.failonerror}">
<jvmarg value="-Djava.protocol.handler.pkgs=${java.protocol.handler.pkgs}"/>
<jvmarg value="-Dorg.apache.commons.logging.Log=${httpclient.test.log}"/>
<jvmarg value="-Dhttpclient.test.webappContext=${httpclient.test.webappContext}" />
<arg value="${test.entry}"/>
<classpath refid="test.classpath"/>
</java>
</target>
<!-- ========== Targets: "External" Targets: Documentation ================ -->
<target name="doc" depends="javadoc"
description="Create component documentation.">
<mkdir dir="${dist.home}"/>
<mkdir dir="${dist.home}/docs"/>
<copy todir="${dist.home}/docs" filtering="off">
<fileset dir="docs"/>
</copy>
</target>
<!-- Generates the API documentation for all larm.* packages into
     ${dist.home}/docs/api. -->
<target name="javadoc" depends="compile"
        description="Create component Javadoc documentation">
  <mkdir dir="${dist.home}"/>
  <mkdir dir="${dist.home}/docs"/>
  <mkdir dir="${dist.home}/docs/api"/>
  <!-- markup inside an attribute value must be entity-escaped,
       otherwise the build file is not well-formed XML -->
  <javadoc sourcepath ="${source.home}/java"
           destdir ="${dist.home}/docs/api"
           packagenames ="larm.*"
           author ="true"
           protected ="true"
           version ="true"
           source="1.4"
           doctitle ="&lt;h1&gt;${app.title}&lt;/h1&gt;"
           windowtitle ="${app.title} (Version ${app.version})"
  >
  </javadoc>
</target>
</project>
|
|
From: <ot...@us...> - 2003-07-29 15:15:13
|
Update of /cvsroot/larm/larm/src/java/larm/pipes In directory sc8-pr-cvs1:/tmp/cvs-serv24906/src/java/larm/pipes Modified Files: PipelineManager.java Log Message: - Small fix. Index: PipelineManager.java =================================================================== RCS file: /cvsroot/larm/larm/src/java/larm/pipes/PipelineManager.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** PipelineManager.java 29 Jul 2003 15:10:40 -0000 1.1 --- PipelineManager.java 29 Jul 2003 15:15:10 -0000 1.2 *************** *** 1,7 **** - /* - * Created on 30.06.2003 by Administrator - * - * $Id$ - */ package larm.pipes; --- 1,2 ---- *************** *** 21,26 **** * PipelineManager * ! * @author Administrator ! * 30.06.2003 */ public class PipelineManager implements Configurable, Startable --- 16,21 ---- * PipelineManager * ! * @author ! * @version $Id$ */ public class PipelineManager implements Configurable, Startable |
|
From: <ot...@us...> - 2003-07-29 15:11:44
|
Update of /cvsroot/larm/larm/src/test/larm
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/test/larm
Modified Files:
AllTests.java
Log Message:
- Big bad update, reorganization, etc.
Index: AllTests.java
===================================================================
RCS file: /cvsroot/larm/larm/src/test/larm/AllTests.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** AllTests.java 24 Jun 2003 17:19:02 -0000 1.1
--- AllTests.java 29 Jul 2003 15:11:41 -0000 1.2
***************
*** 1,35 ****
! /*
! * Created on 23.06.2003
! *
! */
!
! package larm;
!
! import junit.framework.Test;
! import junit.framework.TestSuite;
! import larm.config.ConfigurationTest;
! import larm.pipes.PipelineTest;
!
! /**
! * @author cmarschner
! *
! * contains all tests
! */
! public class AllTests
! {
!
! public static void main(String[] args)
! {
! junit.swingui.TestRunner.run(AllTests.class);
! }
!
! public static Test suite()
! {
! TestSuite suite = new TestSuite("Test for larm");
! //$JUnit-BEGIN$
! suite.addTestSuite(ConfigurationTest.class);
! suite.addTestSuite(PipelineTest.class);
! //$JUnit-END$
! return suite;
! }
! }
--- 1,31 ----
! package larm;
!
! import junit.framework.Test;
! import junit.framework.TestSuite;
! import larm.config.ConfigurationTest;
! import larm.pipes.PipelineTest;
!
! /**
! * Contains all tests.
! *
! * @author
! * @version $Id$
! */
! public class AllTests
! {
!
! public static void main(String[] args)
! {
! junit.swingui.TestRunner.run(AllTests.class);
! }
!
! public static Test suite()
! {
! TestSuite suite = new TestSuite("Test for larm");
! //$JUnit-BEGIN$
! suite.addTestSuite(ConfigurationTest.class);
! suite.addTestSuite(PipelineTest.class);
! //$JUnit-END$
! return suite;
! }
! }
|
|
From: <ot...@us...> - 2003-07-29 15:11:44
|
Update of /cvsroot/larm/larm/src/test/larm/config
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/test/larm/config
Modified Files:
ConfigurationTest.java
Log Message:
- Big bad update, reorganization, etc.
Index: ConfigurationTest.java
===================================================================
RCS file: /cvsroot/larm/larm/src/test/larm/config/ConfigurationTest.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** ConfigurationTest.java 24 Jun 2003 17:19:32 -0000 1.1
--- ConfigurationTest.java 29 Jul 2003 15:11:41 -0000 1.2
***************
*** 1,95 ****
! /*
! * Created on 24.06.2003 by Administrator
! *
! * $Id$
! */
! package larm.config;
!
! import junit.framework.TestCase;
!
! /**
! * ConfigurationTest
! *
! * @author Administrator
! * 24.06.2003
! */
! public class ConfigurationTest extends TestCase
! {
!
! /**
! * Constructor for ConfigurationTest.
! * @param arg0
! */
! public ConfigurationTest(String arg0)
! {
! super(arg0);
! }
!
! public static void main(String[] args)
! {
! junit.swingui.TestRunner.run(ConfigurationTest.class);
! }
!
! /*
! * Test for String getPropertyAsString(String)
! */
! public void testGetPropertyAsStringString()
! {
! }
!
! /*
! * Test for String getPropertyAsString(String, String)
! */
! public void testGetPropertyAsStringStringString()
! {
! }
!
! /*
! * Test for long getPropertyAsLong(String)
! */
! public void testGetPropertyAsLongString()
! {
! }
!
! /*
! * Test for long getPropertyAsLong(String, long)
! */
! public void testGetPropertyAsLongStringlong()
! {
! }
!
! public void testGetPropertyAsDouble()
! {
! }
!
! public void testGetPropertyAsBoolean()
! {
! }
!
! public void testGetPropertyAsNrOfBytes()
! {
! }
!
! public void testGetSubConfig()
! {
! }
!
! public void testGetSubConfigList()
! {
! }
!
! /*
! * Test for void Configuration(Node)
! */
! public void testConfigurationNode()
! {
! }
!
! /*
! * Test for void Configuration(Reader)
! */
! public void testConfigurationReader()
! {
! }
!
! }
--- 1,124 ----
! package larm.config;
!
! import java.io.Reader;
! import java.io.StringReader;
!
! import junit.framework.TestCase;
! import larm.framework.config.Configuration;
!
! /**
! * ConfigurationTest
! *
! * @author
! * @version $Id$
! */
! public class ConfigurationTest extends TestCase
! {
! String xml = null;
! Reader xmlReader = null;
! Configuration c = null;
!
! public void setUp()
! {
! try
! {
! xml = "<?xml version=\"1.0\"?>" +
! "<larm>" +
! " <sources>" +
! " <fileSource>" +
! " <fileset dir=\"c:/larm/test/*.lst\"/>" +
! " <pipeline>testPipe</pipeline>" +
! " </fileSource>" +
! " </sources>" +
! " <pipelines>" +
! " " +
! " </pipelines>" +
! "</larm>";
! xmlReader = new StringReader(xml);
! c = new Configuration(xmlReader);
! }
! catch(Exception e)
! {
! TestCase.fail(e.toString());
! }
! }
!
! /**
! * Constructor for ConfigurationTest.
! * @param arg0
! */
! public ConfigurationTest(String arg0)
! {
! super(arg0);
! }
!
! public static void main(String[] args)
! {
! junit.swingui.TestRunner.run(ConfigurationTest.class);
! }
!
! /*
! * Test for String getPropertyAsString(String)
! */
! public void testGetPropertyAsStringString()
! {
! assertEquals("testPipe", c.getProperty("/larm/sources/fileSource/pipeline"));
! }
!
! /*
! * Test for String getPropertyAsString(String, String)
! */
! public void testGetPropertyAsStringStringString()
! {
! assertEquals("testPipe", c.getProperty("/larm/sources/fileSource/pipeline", "def1"));
! assertEquals("def1", c.getProperty("/larm/sources/fileSource/pipeline111", "def1"));
! }
!
! /*
! * Test for long getPropertyAsLong(String)
! */
! public void testGetPropertyAsLongString()
! {
! }
!
! /*
! * Test for long getPropertyAsLong(String, long)
! */
! public void testGetPropertyAsLongStringlong()
! {
! }
!
! public void testGetPropertyAsDouble()
! {
! }
!
! public void testGetPropertyAsBoolean()
! {
! }
!
! public void testGetPropertyAsNrOfBytes()
! {
! }
!
! public void testGetSubConfig()
! {
! }
!
! public void testGetSubConfigList()
! {
! }
!
! /*
! * Test for void Configuration(Node)
! */
! public void testConfigurationNode()
! {
! }
!
! /*
! * Test for void Configuration(Reader)
! */
! public void testConfigurationReader()
! {
! }
! }
|
|
From: <ot...@us...> - 2003-07-29 15:11:44
|
Update of /cvsroot/larm/larm/src/test/larm/pipes
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/test/larm/pipes
Modified Files:
PipelineTest.java
Log Message:
- Big bad update, reorganization, etc.
Index: PipelineTest.java
===================================================================
RCS file: /cvsroot/larm/larm/src/test/larm/pipes/PipelineTest.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** PipelineTest.java 24 Jun 2003 17:20:09 -0000 1.1
--- PipelineTest.java 29 Jul 2003 15:11:41 -0000 1.2
***************
*** 1,38 ****
! /*
! * Created on 23.06.2003
! *
! * To change the template for this generated file go to
! * Window>Preferences>Java>Code Generation>Code and Comments
! */
! package larm.pipes;
!
! import junit.framework.TestCase;
!
! /**
! * @author Administrator
! */
! public class PipelineTest extends TestCase
! {
!
!
!
! /**
! * Constructor for PipelineTest.
! * @param arg0
! */
! public PipelineTest(String arg0)
! {
! super(arg0);
! }
!
! public void testSomething()
! {
! // TODO add code here
! }
!
! public static void main(String[] args)
! {
! junit.swingui.TestRunner.run(PipelineTest.class);
! }
!
! }
--- 1,29 ----
! package larm.pipes;
!
! import junit.framework.TestCase;
!
! /**
! * @author
! * @version $Id$
! */
! public class PipelineTest extends TestCase
! {
! /**
! * Constructor for PipelineTest.
! * @param arg0
! */
! public PipelineTest(String arg0)
! {
! super(arg0);
! }
!
! public void testSomething()
! {
! // TODO add code here
! }
!
! public static void main(String[] args)
! {
! junit.swingui.TestRunner.run(PipelineTest.class);
! }
! }
|
|
From: <ot...@us...> - 2003-07-29 15:11:44
|
Update of /cvsroot/larm/larm/src/java/larm/sources
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/sources
Added Files:
SourceManager.java
Log Message:
- Big bad update, reorganization, etc.
--- NEW FILE: SourceManager.java ---
package larm.sources;
import java.util.HashMap;
import java.util.Iterator;
import java.util.logging.Logger;
import larm.framework.Context;
import larm.framework.Contextualizable;
import larm.framework.Lifecycle;
import larm.framework.Startable;
import larm.framework.config.ConfigList;
import larm.framework.config.Configurable;
import larm.framework.config.Configuration;
import larm.framework.sources.Source;
/**
 * SourceManager reads the "source" entries of its configuration section,
 * instantiates the class named by each entry's "@type" attribute and keeps
 * the resulting {@link Source} objects under the entry's "@name".
 *
 * @author
 * @version $Id: SourceManager.java,v 1.1 2003/07/29 15:11:41 otis Exp $
 */
public class SourceManager implements Configurable, Startable, Contextualizable
{
    static Logger log = Logger.getLogger(SourceManager.class.getName());

    /** configured sources, keyed by the value of their "@name" attribute */
    HashMap sources = new HashMap();

    public SourceManager()
    {
    }

    /**
     * Creates and configures one Source per "source" sub configuration.
     * Failures are logged with their stack trace instead of being silently
     * discarded; as before, the first failure stops the processing of the
     * remaining entries.
     *
     * @param conf the configuration section containing the "source" entries
     * @see larm.framework.config.Configurable#configure(larm.framework.config.Configuration)
     */
    public void configure(Configuration conf)
    {
        try
        {
            log.info("configuring SourceManager");
            ConfigList list = conf.getSubConfigList("source");
            for (int i = 0; i < list.length(); i++)
            {
                Configuration c = list.item(i);
                String type = c.getProperty("@type");
                String name = c.getProperty("@name");
                log.info("found source of type " + type + " with name " + name);
                Class clazz = Class.forName(type);
                Source p = (Source)clazz.newInstance();
                Lifecycle.configure(p, c);
                sources.put(name, p);
            }
            if (list.length() == 0)
            {
                log.info("no sources to register");
            }
        }
        catch(InstantiationException e)
        {
            log.log(java.util.logging.Level.SEVERE, "could not instantiate Source", e);
        }
        catch(ClassNotFoundException e)
        {
            log.log(java.util.logging.Level.SEVERE, "class not found", e);
        }
        catch(IllegalAccessException e)
        {
            log.log(java.util.logging.Level.SEVERE, "illegal access", e);
        }
        catch(ClassCastException e)
        {
            // same handling as BlockingPipeline uses for a wrongly typed class
            log.log(java.util.logging.Level.SEVERE, "not a Source", e);
        }
    }

    /**
     * Starts every registered source via Lifecycle.start().
     *
     * @see larm.framework.Startable#start()
     */
    public void start()
    {
        for (Iterator it = sources.values().iterator(); it.hasNext();)
        {
            Lifecycle.start(it.next());
        }
    }

    /**
     * Currently a no-op; the context is not used by this manager.
     *
     * @see larm.framework.Contextualizable#contextualize(larm.framework.Context)
     */
    public void contextualize(Context ctx)
    {
    }
}
|
|
From: <ot...@us...> - 2003-07-29 15:11:43
|
Update of /cvsroot/larm/larm/src/java/larm/root
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/root
Added Files:
LARM.java
Log Message:
- Big bad update, reorganization, etc.
--- NEW FILE: LARM.java ---
package larm.root;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.logging.Level;
import java.util.logging.Logger;
import larm.config.PropertyManager;
import larm.framework.Context;
import larm.framework.Contextualizable;
import larm.framework.Lifecycle;
import larm.framework.Startable;
import larm.framework.config.Configurable;
import larm.framework.config.Configuration;
import larm.pipes.PipelineManager;
import larm.sources.SourceManager;
import org.xml.sax.SAXException;
/**
 * LARM is the application entry point. It owns the property, pipeline and
 * source managers, hands each of them its section of the configuration file
 * and then starts the pipelines and the sources.
 *
 * @author
 * @version $Id: LARM.java,v 1.1 2003/07/29 15:11:41 otis Exp $
 */
public final class LARM implements Configurable, Contextualizable, Startable
{
    /**
     * reads the properties section and puts them to the system properties
     */
    PropertyManager propertyManager = new PropertyManager();

    /**
     * reads the pipes section and registers all pipelines
     */
    PipelineManager pipelineManager = new PipelineManager();

    /**
     * reads the sources section and initializes and starts all sources
     */
    SourceManager sourceManager = new SourceManager();

    /**
     * this context contains the above objects
     */
    Context context = new Context();

    /**
     * Passes the "properties", "pipes" and "sources" sub configurations to
     * the respective managers, in that order.
     *
     * @see larm.framework.config.Configurable#configure(larm.framework.config.Configuration)
     */
    public void configure(Configuration conf)
    {
        Lifecycle.configure(propertyManager, conf.getSubConfig("properties"));
        Lifecycle.configure(pipelineManager, conf.getSubConfig("pipes"));
        Lifecycle.configure(sourceManager, conf.getSubConfig("sources"));
    }

    /**
     * Registers the managers in this object's own context and passes that
     * context on to them. The ctx argument is not used (main() passes null).
     *
     * @see larm.framework.Contextualizable#contextualize(larm.framework.Context)
     */
    public void contextualize(Context ctx)
    {
        context.set("propertyManager", propertyManager);
        context.set("sourceManager", sourceManager);
        context.set("pipelineManager", pipelineManager);
        Lifecycle.contextualize(propertyManager, context);
        Lifecycle.contextualize(pipelineManager, context);
        Lifecycle.contextualize(sourceManager, context);
    }

    /**
     * Starts the pipelines first, then the sources.
     *
     * @see larm.framework.Startable#start()
     */
    public void start()
    {
        // Lifecycle.start(propertyManager);
        Lifecycle.start(pipelineManager);
        Lifecycle.start(sourceManager);
    }

    /**
     * Command line entry point.
     *
     * @param args exactly one element: the path of the XML configuration file
     */
    public static void main(String[] args)
    {
        System.out.println("LARM");
        if(args.length != 1)
        {
            System.out.println("Usage: java larm.root.LARM <configfile.xml>");
            // without a config file there is nothing to do; continuing would
            // fail on args[0] when no argument was given
            return;
        }
        Logger log = Logger.getLogger("");
        log.setLevel(Level.CONFIG);
        try
        {
            LARM larm = new LARM();
            Configuration config;
            FileReader reader = new FileReader(args[0]);
            try
            {
                config = new Configuration(reader);
            }
            finally
            {
                // release the file handle once the configuration has been read
                reader.close();
            }
            log.info("Configuring...");
            Lifecycle.configure(larm, config.getSubConfig("/larm"));
            log.info("Contextualizing...");
            Lifecycle.contextualize(larm, null);
            log.info("Starting...");
            Lifecycle.start(larm);
        }
        catch(FileNotFoundException e)
        {
            System.out.println("Could not find file: " + args[0]);
        }
        catch(IOException e)
        {
            System.out.println("I/O Error while reading file: " + args[0]);
        }
        catch(SAXException e)
        {
            System.out.println("Error while parsing " + args[0] + ": " + e.getMessage());
        }
    }
}
|
|
From: <ot...@us...> - 2003-07-29 15:11:43
|
Update of /cvsroot/larm/larm/src/java/larm/framework/sources
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/framework/sources
Added Files:
Source.java
Log Message:
- Big bad update, reorganization, etc.
--- NEW FILE: Source.java ---
package larm.framework.sources;
/**
 * Source is a marker interface without methods. SourceManager casts every
 * class named in a "source" configuration entry to this type.
 *
 * @author
 * @version $Id: Source.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface Source
{
}
|
Update of /cvsroot/larm/larm/src/java/larm/framework/pipes
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/framework/pipes
Added Files:
BlockingPipeline.java MemoryQueue.java Message.java
MessageProcessor.java NonblockingPipeline.java Pipeline.java
Queue.java
Log Message:
- Big bad update, reorganization, etc.
--- NEW FILE: BlockingPipeline.java ---
package larm.framework.pipes;
import java.util.ArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;
import larm.framework.Lifecycle;
import larm.framework.Startable;
import larm.framework.config.ConfigList;
import larm.framework.config.Configurable;
import larm.framework.config.Configuration;
/**
 * BlockingPipeline passes a message through its processors one after the
 * other in the calling thread and returns the result.
 *
 * @author
 * @version $Id: BlockingPipeline.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public class BlockingPipeline implements Pipeline, Configurable, Startable
{
    /** created lazily in configure() */
    static Logger log=null;

    /** growable list of the registered processors */
    ArrayList processorList = new ArrayList();

    /**
     * array snapshot of processorList used by process() and start();
     * starts out empty so that both work before any processor is added
     */
    MessageProcessor[] processors = new MessageProcessor[0];

    int numProcessors;

    /**
     * Runs the message through all processors in registration order. A
     * processor that returns null ends the chain.
     *
     * @param message the message to process
     * @return the message returned by the last processor that ran, or null
     *         if a processor returned null
     * @see MessageProcessor#process(Message)
     */
    public Message process(Message message)
    {
        for(int i = 0; i<numProcessors; i++)
        {
            message = processors[i].process(message);
            if(message == null)
            {
                break;
            }
        }
        return message;
    }

    /**
     * Appends a processor to the end of the chain.
     *
     * @param p the processor to add
     * @see Pipeline#addMessageProcessor(MessageProcessor)
     */
    public void addMessageProcessor(MessageProcessor p)
    {
        processorList.add(p);
        // the no-argument toArray() returns an Object[], which cannot be cast
        // to MessageProcessor[]; pass a typed array instead
        processors = (MessageProcessor[])processorList.toArray(new MessageProcessor[processorList.size()]);
        numProcessors = processors.length;
    }

    /**
     * Instantiates, configures and registers one MessageProcessor per
     * "processor" sub configuration, using the class named by its "@type"
     * attribute. Errors are logged; the first error stops the processing of
     * the remaining entries.
     *
     * @param conf the configuration section of this pipeline
     * @see larm.framework.config.Configurable#configure(larm.framework.config.Configuration)
     */
    public void configure(Configuration conf)
    {
        if(log == null)
        {
            log = Logger.getLogger(this.getClass().getName());
        }
        try
        {
            ConfigList processorConfigs = conf.getSubConfigList("processor");
            for(int i = 0; i<processorConfigs.length(); i++)
            {
                Configuration processor = processorConfigs.item(i);
                String type = processor.getProperty("@type");
                Class clazz = Class.forName(type);
                MessageProcessor newMP = (MessageProcessor)clazz.newInstance();
                Lifecycle.configure(newMP, processor);
                addMessageProcessor(newMP);
            }
        }
        catch(IllegalArgumentException e)
        {
            log.log(Level.SEVERE,"config", e);
        }
        catch(ClassNotFoundException e)
        {
            log.log(Level.SEVERE,"class not found", e);
        }
        catch(IllegalAccessException e)
        {
            log.log(Level.SEVERE,"illegal access", e);
        }
        catch(ClassCastException e)
        {
            log.log(Level.SEVERE,"not a MessageProcessor", e);
        }
        catch(InstantiationException e)
        {
            log.log(Level.SEVERE,"could not instantiate MessageProcessor", e);
        }
    }

    /**
     * Starts every registered processor via Lifecycle.start().
     *
     * @see larm.framework.Startable#start()
     */
    public void start()
    {
        for(int i = 0; i<processors.length; i++)
        {
            Lifecycle.start(processors[i]);
        }
    }
}
--- NEW FILE: MemoryQueue.java ---
package larm.framework.pipes;
import java.util.LinkedList;
/**
 * MemoryQueue is a bounded first-in/first-out queue held in memory.
 * enqueue() blocks while the queue is full, dequeue() blocks while it is
 * empty. The capacity is 0 until init() has been called, so init() must be
 * called before the first enqueue().
 *
 * @author
 * @version $Id: MemoryQueue.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public class MemoryQueue implements Queue
{
    /** elements are added at the head and removed from the tail */
    LinkedList queue = new LinkedList();

    /** capacity set by init() */
    int size=0;

    /** number of elements currently stored */
    int used=0;

    public MemoryQueue()
    {}

    /**
     * Empties the queue and sets its capacity.
     *
     * @param maxSize maximum number of elements the queue may hold
     */
    public synchronized void init(int maxSize)
    {
        queue.clear();
        size = maxSize;
        used = 0;
        // producers blocked on the previous capacity must re-check it
        notifyAll();
    }

    /**
     * @return the capacity set by init()
     */
    public synchronized int getMaxSize()
    {
        return size;
    }

    /**
     * @return the number of elements currently stored
     */
    public synchronized int length()
    {
        return used;
    }

    /**
     * Removes and returns the oldest element, waiting until one is
     * available. The condition is re-checked in a loop after every wake-up,
     * so this method never returns null because of a spurious or misdirected
     * wake-up. An interrupt received while waiting does not abort the wait;
     * the thread's interrupt status is restored before returning.
     *
     * @return the oldest element
     * @see Queue#dequeue()
     */
    public synchronized Object dequeue()
    {
        boolean interrupted = false;
        while(used == 0)
        {
            try
            {
                wait();
            }
            catch(InterruptedException e)
            {
                interrupted = true;
            }
        }
        Object object = queue.removeLast();
        used--;
        // producers and consumers share this monitor, so wake all waiters
        notifyAll();
        if(interrupted)
        {
            Thread.currentThread().interrupt();
        }
        return object;
    }

    /**
     * Adds an element, waiting until there is room for it. The element is
     * never dropped: the capacity is re-checked in a loop after every
     * wake-up. An interrupt received while waiting does not abort the wait;
     * the thread's interrupt status is restored before returning.
     *
     * @param object the element to add
     * @see Queue#enqueue(java.lang.Object)
     */
    public synchronized void enqueue(Object object)
    {
        boolean interrupted = false;
        while(used >= size)
        {
            try
            {
                wait();
            }
            catch(InterruptedException e)
            {
                interrupted = true;
            }
        }
        queue.addFirst(object);
        used++;
        notifyAll();
        if(interrupted)
        {
            Thread.currentThread().interrupt();
        }
    }
}
--- NEW FILE: Message.java ---
package larm.framework.pipes;
/**
 * Message is the abstract base type of the objects handed to
 * MessageProcessor.process(). It declares no members of its own.
 *
 * @author
 * @version
 */
public abstract class Message
{
}
--- NEW FILE: MessageProcessor.java ---
package larm.framework.pipes;
/**
 * MessageProcessor is a single processing step for messages.
 *
 * @author
 * @version $Id: MessageProcessor.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface MessageProcessor
{
// Processes a message and returns the result. BlockingPipeline treats a
// null return value as the signal to stop processing the message.
Message process(Message m);
}
--- NEW FILE: NonblockingPipeline.java ---
package larm.framework.pipes;
import java.util.logging.Level;
import java.util.logging.Logger;
import larm.framework.Lifecycle;
import larm.framework.Startable;
import larm.framework.config.Configurable;
import larm.framework.config.Configuration;
/**
 * NonblockingPipeline decouples the caller from the processing: process()
 * only puts the message into a queue, and a separate polling thread takes
 * the messages out of the queue and runs them through an internal
 * BlockingPipeline.
 *
 * @author
 * @version
 */
public class NonblockingPipeline implements Pipeline, Configurable, Startable, Runnable
{
    /** performs the actual processing in the polling thread */
    BlockingPipeline pipe = new BlockingPipeline();

    /** created in configure() */
    Queue queue;

    /** created and started in start() */
    Thread pollingThread;

    static Logger log = Logger.getLogger(NonblockingPipeline.class.getName());

    final static long DEFAULT_QUEUE_SIZE = 1000;

    /**
     * Adds the processor to the internal pipeline.
     *
     * @see Pipeline#addMessageProcessor(MessageProcessor)
     */
    public void addMessageProcessor(MessageProcessor p)
    {
        pipe.addMessageProcessor(p);
    }

    /**
     * Enqueues the message for asynchronous processing. With the default
     * MemoryQueue this call will block when the queue is full.
     *
     * @param message the message to enqueue
     * @return the unprocessed message that was passed in
     * @see MessageProcessor#process(Message)
     */
    public synchronized Message process(Message message)
    {
        queue.enqueue(message);
        return message;
    }

    /**
     * Creates the queue and configures the internal pipeline. If the
     * configuration contains a "queue" element, the class named by its
     * "@type" attribute is instantiated; otherwise a MemoryQueue is used.
     * The capacity is read from "@queueSize" (default DEFAULT_QUEUE_SIZE).
     *
     * @see larm.framework.config.Configurable#configure(larm.framework.config.Configuration)
     */
    public void configure(Configuration conf)
    {
        try
        {
            if(conf.contains("queue"))
            {
                Configuration queueConfig = conf.getSubConfig("queue");
                String type = queueConfig.getProperty("@type");
                queue = (Queue)Class.forName(type).newInstance();
                Lifecycle.configure(queue, queueConfig);
            }
            else
            {
                queue = new MemoryQueue();
            }
            queue.init((int)conf.getPropertyAsLong("@queueSize", DEFAULT_QUEUE_SIZE));
            pipe.configure(conf);
        }
        catch(IllegalAccessException e)
        {
            log.log(Level.SEVERE, "illegal access", e);
        }
        catch(ClassCastException e)
        {
            log.log(Level.SEVERE, "not a queue class", e);
        }
        catch(InstantiationException e)
        {
            log.log(Level.SEVERE, "could not instantiate queue", e);
        }
        catch(ClassNotFoundException e)
        {
            log.log(Level.SEVERE, "could not find class for queue", e);
        }
    }

    /**
     * Starts the internal pipeline and then the polling thread.
     *
     * @see larm.framework.Startable#start()
     */
    public void start()
    {
        Lifecycle.start(pipe);
        pollingThread = new Thread(this);
        pollingThread.start();
    }

    /**
     * Polling loop: takes messages from the queue and processes them. The
     * loop never terminates on its own.
     *
     * @see java.lang.Runnable#run()
     */
    public void run()
    {
        while(true)
        {
            Message m = (Message)queue.dequeue(); // blocks when empty
            // a queue may hand back null (e.g. when its wait was cut short);
            // there is nothing to process then, so don't feed null to the processors
            if(m != null)
            {
                pipe.process(m);
            }
        }
    }
}
--- NEW FILE: Pipeline.java ---
package larm.framework.pipes;
/**
 * Pipeline is a MessageProcessor to which further MessageProcessors can be
 * added. Implementations in this package: BlockingPipeline and
 * NonblockingPipeline.
 *
 * @author
 * @version $Id: Pipeline.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface Pipeline extends MessageProcessor
{
// Registers a processor with this pipeline.
void addMessageProcessor(MessageProcessor p);
}
--- NEW FILE: Queue.java ---
package larm.framework.pipes;
/**
 * Queue is the buffer abstraction used by NonblockingPipeline.
 * MemoryQueue is the in-memory implementation in this package.
 *
 * @author
 * @version $Id: Queue.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface Queue
{
// Sets the maximum size; NonblockingPipeline calls this once after creating the queue.
void init(int maxSize);
// Adds an object to the queue.
void enqueue(Object object);
// Removes and returns an object; NonblockingPipeline expects this to block while the queue is empty.
Object dequeue();
// Returns the maximum size (in MemoryQueue, the value passed to init()).
int getMaxSize();
// Returns the current number of queued objects.
int length();
}
|
|
From: <ot...@us...> - 2003-07-29 15:11:43
|
Update of /cvsroot/larm/larm/src/java/larm/framework/config
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/framework/config
Added Files:
ConfigList.java Configurable.java Configuration.java
Log Message:
- Big bad update, reorganization, etc.
--- NEW FILE: ConfigList.java ---
package larm.framework.config;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * ConfigList
 * encapsulates an org.w3c.dom.NodeList. Callers obtain instances from
 * Configuration.getSubConfigList(), so that no XML API is exposed to them.
 *
 * @author
 * @version $Id: ConfigList.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public class ConfigList
{
// the wrapped nodes
NodeList nl;
// passed on unchanged to every Configuration created by item()
Node root;
/**
 * @param nodes the nodes to wrap
 * @param rootNode root node handed to the Configuration objects created by item()
 */
protected ConfigList(NodeList nodes, Node rootNode)
{
nl = nodes;
root = rootNode;
}
/**
 * @return the number of Configuration objects contained in this list
 */
public int length()
{
return nl.getLength();
}
/**
 * returns a configuration item [will be created from a NodeList object
 * when this method is called]
 * @param index index of the item. Must be less than {@link #length()}
 * @return a new Configuration wrapping the node at the given index
 */
public Configuration item(int index)
{
return new Configuration(nl.item(index), root);
}
}
--- NEW FILE: Configurable.java ---
package larm.framework.config;
/**
 * Configurable is implemented by components that receive their section of
 * the configuration through configure().
 *
 * @author
 * @version $Id: Configurable.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface Configurable
{
// Hands the component the Configuration it should read its settings from.
void configure(Configuration conf);
}
--- NEW FILE: Configuration.java ---
package larm.framework.config;
import java.io.IOException;
import java.io.Reader;
import java.util.StringTokenizer;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import org.apache.xpath.XPathAPI;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
* Configuration encapsulates an XML file and provides an API similar to the
* java.util.Properties class while retaining the benefits of a hierarchical
* configuration file. Node contents are accessed via
* XPath expressions. Configuration doesn't expose any XML APIs. XPath Node lists
* are encapsulated via @see{ConfigList}.
*
* @author
* @version $Id: Configuration.java,v 1.1 2003/07/29 15:11:40 otis Exp $
*/
public class Configuration
{
/**
* the node represented by this configuration.
*/
private Node doc;
/**
* root node for evaluating absolute XPath expressions
*/
private Node rootNode;
/**
 * Selects all nodes matching an XPath expression, evaluated relative to
 * the node represented by this configuration.
 *
 * @param xpath the XPath expression
 * @return the matching nodes
 * @throws IllegalArgumentException if the expression cannot be evaluated;
 *         the TransformerException is attached as the cause
 */
protected NodeList getNodes(String xpath) throws IllegalArgumentException
{
    try
    {
        return XPathAPI.selectNodeList(doc, xpath);
    }
    catch(TransformerException e)
    {
        IllegalArgumentException iae = new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")");
        // keep the original exception and its stack trace available to callers
        iae.initCause(e);
        throw iae;
    }
}
/**
 * @return the node represented by this configuration
 */
protected Node getThisNode()
{
return doc;
}
/**
 * Selects a single node by an XPath expression, evaluated relative to the
 * node represented by this configuration.
 *
 * @param xpath the XPath expression
 * @return the selected node; contains() relies on this being null when
 *         nothing matches
 * @throws IllegalArgumentException if the expression cannot be evaluated;
 *         the TransformerException is attached as the cause
 */
protected Node getNode(String xpath) throws IllegalArgumentException
{
    try
    {
        return XPathAPI.selectSingleNode(doc, xpath);
    }
    catch(TransformerException e)
    {
        IllegalArgumentException iae = new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")");
        // keep the original exception and its stack trace available to callers
        iae.initCause(e);
        throw iae;
    }
}
/**
 * Tells whether an XPath expression selects a node below this
 * configuration.
 *
 * @param xpath the XPath expression
 * @return true if a node was selected
 * @throws IllegalArgumentException if the expression cannot be evaluated
 */
public boolean contains(String xpath)
{
    // getNode() performs the same lookup and the same exception translation
    Node match = getNode(xpath);
    return match != null;
}
/**
 * resolves placeholders in strings like
 * <ol>
 * <li>"foo ${my.property} bar". my.property is read
 * from the system properties and inserted into the string.
 * <li>"foo $${/my/xpath} bar". /my/xpath must be a valid XPath expression
 * that can be translated into a string. It is resolved using the configuration's
 * root element
 * </ol>
 * A '$' or '$$' that is not followed by '{' is copied literally, and so is
 * a placeholder that lacks its closing '}'. After a literal '$$', a further
 * '$' is treated as the possible start of a new placeholder.
 * resolveProperty works recursively: the value of a placeholder is itself
 * resolved, with the recursion budget reduced by one.
 * @param prop the string to resolve
 * @param recurse the number of nested resolution levels still allowed
 * @return the string with all placeholders replaced
 * @throws IllegalStateException if a placeholder is found while recurse is 0
 * @throws IllegalArgumentException if an XPath placeholder cannot be evaluated
 */
protected String resolveProperty(String prop, int recurse)
{
    final int CHAR = 0;      // copying ordinary characters
    final int DOLLAR = 1;    // read "$"
    final int DOLLAR2 = 2;   // read "$$"
    final int PROPERTY = 3;  // inside "${...}"
    final int XPATH = 4;     // inside "$${...}"

    int length = prop.length();
    StringBuffer s = new StringBuffer(length * 2);
    int start = 0;           // index of the '$' that opened the current placeholder
    int mode = CHAR;
    for(int i = 0; i < length; i++)
    {
        char ac = prop.charAt(i);
        switch(mode)
        {
            case CHAR:
                if(ac == '$')
                {
                    mode = DOLLAR;
                    start = i;
                }
                else
                {
                    s.append(ac);
                }
                break;
            case DOLLAR:
                if(ac == '{')
                {
                    mode = PROPERTY;
                }
                else if(ac == '$')
                {
                    mode = DOLLAR2;
                }
                else
                {
                    mode = CHAR;
                    s.append('$');
                    s.append(ac);
                }
                break;
            case DOLLAR2:
                if(ac == '{')
                {
                    mode = XPATH;
                }
                else
                {
                    s.append('$');
                    s.append('$');
                    if(ac == '$')
                    {
                        // may open a new placeholder
                        mode = DOLLAR;
                        start = i;
                    }
                    else
                    {
                        // the character after a literal "$$" must not be lost
                        mode = CHAR;
                        s.append(ac);
                    }
                }
                break;
            case PROPERTY:
                if(ac == '}')
                {
                    // got Java property; the name starts after "${"
                    String varName = prop.substring(start + 2, i);
                    String value = getJavaPropertyAsStringDontResolve(varName);
                    s.append(resolvePlaceholderValue(varName, value, recurse));
                    mode = CHAR;
                }
                // must not fall through, or the name would be resolved a
                // second time as an XPath expression
                break;
            case XPATH:
                if(ac == '}')
                {
                    // got XPath expression; it starts after the three characters "$${"
                    String varName = prop.substring(start + 3, i);
                    String value = getPropertyAsStringDontResolve(rootNode, varName);
                    s.append(resolvePlaceholderValue(varName, value, recurse));
                    mode = CHAR;
                }
                break;
        }
    }
    if(mode != CHAR)
    {
        // unterminated placeholder: keep the raw text
        s.append(prop.substring(start));
    }
    return s.toString();
}

/**
 * resolves the value of one placeholder, enforcing the recursion limit.
 */
private String resolvePlaceholderValue(String varName, String value, int recurse)
{
    if(recurse == 0)
    {
        throw new IllegalStateException("recursion limit reached while resolving " + varName + " -> " + value);
    }
    return resolveProperty(value, recurse - 1);
}
/**
 * Looks up a system property without expanding placeholders in its value.
 *
 * @param varName name of the system property
 * @return the property value, or the empty string if the property is not set
 */
protected String getJavaPropertyAsStringDontResolve(String varName)
{
    final String value = System.getProperty(varName);
    if(value == null)
    {
        return "";
    }
    return value;
}
/**
 * Evaluates an XPath expression against this configuration's node and returns
 * its string value without expanding placeholders.
 *
 * @param xpath the XPath expression
 * @return the string value of the expression
 * @throws IllegalArgumentException if the expression could not be evaluated
 */
public String getPropertyAsStringDontResolve(String xpath) throws IllegalArgumentException
{
    final String raw = getPropertyAsStringDontResolve(doc, xpath);
    return raw;
}
/**
 * Evaluates <code>string(xpath)</code> against the given node and returns the
 * result without expanding placeholders.
 *
 * @param root the context node for the evaluation
 * @param xpath the XPath expression
 * @return the string value of the expression
 * @throws IllegalArgumentException if the expression could not be evaluated
 */
protected String getPropertyAsStringDontResolve(Node root, String xpath) throws IllegalArgumentException
{
    final String expression = "string(" + xpath + ")";
    try
    {
        return XPathAPI.eval(root, expression).str();
    }
    catch(TransformerException cause)
    {
        throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + cause + ")");
    }
}
/**
 * Returns a property as a string, with placeholders expanded (up to 10
 * levels of nested resolution).
 * @param xpath an XPath expression like '/foo[@bar="value"]'
 * @return the resolved string value of the expression
 * @throws IllegalArgumentException if there was a problem with the XPath
 * @throws IllegalStateException if properties couldn't be resolved (e.g. recursion level reached)
 */
public String getProperty(String xpath) throws IllegalArgumentException, IllegalStateException
{
    final String unresolved = getPropertyAsStringDontResolve(doc, xpath);
    final int maxDepth = 10;
    return resolveProperty(unresolved, maxDepth);
}
/**
 * Returns a property as a string, or the default value if the resolved
 * property is the empty string.
 * @param xpath an XPath expression like '/foo[@bar="value"]'
 * @param def the value to return when the property resolves to ""
 * @return the resolved property, or <code>def</code>
 * @throws IllegalArgumentException if there was a problem with the XPath
 */
public String getProperty(String xpath, String def) throws IllegalArgumentException
{
    final String resolved = getProperty(xpath);
    if("".equals(resolved))
    {
        return def;
    }
    return resolved;
}
/**
 * Returns a property as a long. Delegates to
 * {@link #getPropertyAsLong(String, long)} with a default value of 0.
 * @param xpath an XPath expression like './foo[@bar="value"]'
 * @return the value as returned by the two-argument variant
 * @throws IllegalArgumentException if there was a problem with the XPath
 * @throws NumberFormatException if the value is not a valid long
 */
public long getPropertyAsLong(String xpath) throws IllegalArgumentException, NumberFormatException
{
    final long fallback = 0;
    return getPropertyAsLong(xpath, fallback);
}
/**
 * Returns a property as a long, or the default value if it was not set.
 * A property counts as "not set" when the XPath expression evaluates to an
 * empty (or whitespace-only) string, which is what <code>string()</code>
 * yields for a node that does not exist.
 * @param xpath an XPath expression like './foo[@bar="value"]'
 * @param def the value to return if the property was not set
 * @return the parsed value, or <code>def</code>
 * @throws IllegalArgumentException if there was a problem with the XPath
 * @throws NumberFormatException if the value is present but not a valid long
 */
public long getPropertyAsLong(String xpath, long def) throws IllegalArgumentException, NumberFormatException
{
    final String raw;
    try
    {
        raw = XPathAPI.eval(doc, "string(" + xpath + ")").str();
    }
    catch(TransformerException e)
    {
        throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")");
    }
    catch(NullPointerException npe)
    {
        // kept from the original: a null evaluation result means "not set"
        return def;
    }
    if(raw == null)
    {
        return def;
    }
    // Without this check Long.parseLong("") would throw NumberFormatException
    // and the default would never be returned.
    final String text = raw.trim();
    if(text.length() == 0)
    {
        return def;
    }
    return Long.parseLong(text);
}
/**
 * Returns a property as a double, or 0 if it was not set.
 * A property counts as "not set" when the XPath expression evaluates to an
 * empty (or whitespace-only) string, which is what <code>string()</code>
 * yields for a node that does not exist.
 * @param xpath an XPath expression like './foo[@bar="value"]'
 * @return the parsed value, or 0 if the property was not set
 * @throws IllegalArgumentException if there was a problem with the XPath;
 * this includes the NumberFormatException raised for a value that is present
 * but not a valid double
 */
public double getPropertyAsDouble(String xpath) throws IllegalArgumentException
{
    final String raw;
    try
    {
        raw = XPathAPI.eval(doc, "string(" + xpath + ")").str();
    }
    catch(TransformerException e)
    {
        throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")");
    }
    // Double.parseDouble("") throws, so the documented "0 if not set" needs
    // an explicit check.
    if(raw.trim().length() == 0)
    {
        return 0;
    }
    return Double.parseDouble(raw);
}
/**
 * Returns a property as a boolean, or the default value if it was not set.
 * A property counts as "not set" when the XPath expression evaluates to an
 * empty (or whitespace-only) string, which is what <code>string()</code>
 * yields for a node that does not exist. Any present value is interpreted
 * by {@link Boolean#valueOf(String)}: "true" (ignoring case) is true,
 * everything else is false.
 * @param xpath an XPath expression like './foo[@bar="value"]'
 * @param deflt the default value
 * @return the value, or <code>deflt</code> if the property was not set
 * @throws IllegalArgumentException if there was a problem with the XPath
 */
public boolean getPropertyAsBoolean(String xpath, boolean deflt) throws IllegalArgumentException
{
    final String raw;
    try
    {
        raw = XPathAPI.eval(doc, "string(" + xpath + ")").str();
    }
    catch(TransformerException e)
    {
        throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")");
    }
    catch(NullPointerException e)
    {
        // kept from the original: a null evaluation result means "not set"
        return deflt;
    }
    if(raw == null)
    {
        return deflt;
    }
    // Boolean.valueOf("") is false, so without this check a missing property
    // would silently override a default of true.
    final String text = raw.trim();
    if(text.length() == 0)
    {
        return deflt;
    }
    return Boolean.valueOf(text).booleanValue();
}
/**
 * Returns a property as a long value indicating a number of bytes, or the
 * default value if the property was not set.
 * The input string must conform to (NUMBER (BYTES|KB|MB|GB))+
 * with BYTES = EMPTY|b|byte|bytes,
 * KB = k|kb|kbyte|kbytes|kilobyte|kilobytes
 * MB = m|mb|mbyte|mbytes|megabyte|megabytes
 * GB = g|gb|gbyte|gbytes|gigabyte|gigabytes
 * and EMPTY = the empty string<p>
 * Examples:
 * <ul>
 * <li>"2000" (2000 bytes)
 * <li>"10 kb" (or "10 k", "1 kbyte", "10 kbytes", "1 kilobyte", "10 kilobytes")
 * <li>"3.4 mb" (or "m", "mbyte", "mbytes", "megabyte", "megabytes")
 * <li>"0.3 gb" (or "g", "gbyte", "gbytes", "gigabyte", "gigabytes")
 * </ul>
 * which are all resolved to their byte values (we say k = kb = kilo = 1024
 * although this is not perfectly correct since kilo = 1000; the same applies
 * to mb or gb)<p>
 * Tokens are separated by whitespace. The actual parsing is done by
 * parseNrOfBytes.<p>
 * A property counts as "not set" when the XPath expression evaluates to an
 * empty (or whitespace-only) string, which is what <code>string()</code>
 * yields for a node that does not exist.
 * @param xpath an XPath expression like './foo[@bar="value"]'
 * @param deflt the default value
 * @return the value in bytes, or <code>deflt</code> if the property was not set
 * @throws IllegalArgumentException if there was a problem with the XPath or
 * the value could not be parsed
 */
public long getPropertyAsNrOfBytes(String xpath, long deflt) throws IllegalArgumentException
{
    final String raw;
    try
    {
        raw = XPathAPI.eval(doc, "string(" + xpath + ")").str();
    }
    catch(TransformerException e)
    {
        throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")");
    }
    catch(NullPointerException e)
    {
        // kept from the original: a null evaluation result means "not set"
        return deflt;
    }
    // An empty string contains no tokens and would parse to 0, hiding the
    // default, so "not set" has to be detected before parsing.
    if(raw == null || raw.trim().length() == 0)
    {
        return deflt;
    }
    return parseNrOfBytes(raw);
}
/**
 * Parses a duration such as "30", "2 mins 10 secs", "3.5 hours" or "1 day"
 * into a number of seconds.
 * The string is a whitespace-separated sequence of NUMBER [UNIT] pairs whose
 * values are summed up. Only the last number may omit its unit (it then
 * counts as seconds), because the token following a number is always read as
 * that number's unit. Recognized units, compared case-insensitively:
 * s|sec|secs|second|seconds, m|min|mins|minutes, h|hour|hours, d|day|days.
 * Each pair is truncated to whole seconds before it is added.
 *
 * @param s the string to parse
 * @return the total number of seconds; 0 if the string contains no tokens
 * @throws NumberFormatException if a NUMBER token is not a valid double
 * @throws IllegalArgumentException if a UNIT token is not recognized
 */
private long parseNrOfSeconds(String s)
{
    StringTokenizer tokens = new StringTokenizer(s);
    long total = 0;
    while(tokens.hasMoreTokens())
    {
        double amount = Double.parseDouble(tokens.nextToken());
        int mult = 1;
        if(tokens.hasMoreTokens())
        {
            // fixed locale so that e.g. "MINS" lower-cases predictably
            String type = tokens.nextToken().toLowerCase(java.util.Locale.ENGLISH);
            if("d".equals(type) || "day".equals(type) || "days".equals(type))
            {
                mult = 3600 * 24;
            }
            // must be "else if": a separate "if" chain sent every day unit
            // into the final "unknown unit" branch
            else if("h".equals(type) || "hour".equals(type) || "hours".equals(type))
            {
                mult = 3600;
            }
            else if("m".equals(type) || "min".equals(type) || "mins".equals(type) || "minutes".equals(type))
            {
                mult = 60;
            }
            else if(!("s".equals(type) || "sec".equals(type) || "secs".equals(type)
                      || "second".equals(type) || "seconds".equals(type)))
            {
                throw new IllegalArgumentException("s|sec|secs|second|seconds|m|min|mins|minutes|h|hour|hours|d|day|days expected. (Argument was: '" + type + "')");
            }
        }
        total += (long)(amount * mult);
    }
    return total;
}
/**
 * Parses a size such as "2000", "10 kb", "0.5 mb" or "1 gb 512 kb" into a
 * number of bytes.
 * The string is a whitespace-separated sequence of NUMBER [UNIT] pairs whose
 * values are summed up. Only the last number may omit its unit (it then
 * counts as bytes), because the token following a number is always read as
 * that number's unit. Recognized units, compared case-insensitively:
 * b|byte|bytes, k|kb|kbyte|kbytes|kilobyte|kilobytes (1024),
 * m|mb|mbyte|mbytes|megabyte|megabytes (1024^2),
 * g|gb|gbyte|gbytes|gigabyte|gigabytes (1024^3).
 * Each pair is truncated to whole bytes before it is added.
 *
 * @param s the string to parse
 * @return the total number of bytes; 0 if the string contains no tokens
 * @throws NumberFormatException if a NUMBER token is not a valid double
 * @throws IllegalArgumentException if a UNIT token is not recognized
 */
protected long parseNrOfBytes(String s)
{
    StringTokenizer tokens = new StringTokenizer(s);
    long total = 0;
    while(tokens.hasMoreTokens())
    {
        double amount = Double.parseDouble(tokens.nextToken());
        long mult = 1;
        if(tokens.hasMoreTokens())
        {
            // fixed locale so that upper-case units lower-case predictably
            String type = tokens.nextToken().toLowerCase(java.util.Locale.ENGLISH);
            if("k".equals(type) || "kb".equals(type) || "kbyte".equals(type) || "kbytes".equals(type)
               || "kilobyte".equals(type) || "kilobytes".equals(type))
            {
                mult = 1024L;
            }
            else if("m".equals(type) || "mb".equals(type) || "mbyte".equals(type) || "mbytes".equals(type)
                    || "megabyte".equals(type) || "megabytes".equals(type))
            {
                mult = 1024L * 1024L;
            }
            else if("g".equals(type) || "gb".equals(type) || "gbyte".equals(type) || "gbytes".equals(type)
                    || "gigabyte".equals(type) || "gigabytes".equals(type))
            {
                mult = 1024L * 1024L * 1024L;
            }
            else if(!("b".equals(type) || "byte".equals(type) || "bytes".equals(type)))
            {
                throw new IllegalArgumentException("b|byte|bytes|k|kb|kbyte|kbytes|kilobyte|kilobytes|m|mb|mbyte|mbytes|megabyte|megabytes|g|gb|gbyte|gbytes|gigabyte|gigabytes expected. (Argument was: '" + type + "')");
            }
        }
        total += (long)(amount * mult);
    }
    return total;
}
/**
 * Returns a property as a long value indicating a number of seconds, or the
 * default value if the property was not set.
 * The input string must conform to (NUMBER (SECS|MINS|HOURS|DAYS))+
 * with SECS = EMPTY|s|sec|secs|second|seconds,
 * MINS = m|min|mins|minutes,
 * HOURS = h|hour|hours and
 * DAYS = d|day|days
 * where EMPTY is the empty string. Tokens are separated by whitespace.
 * The actual parsing is done by parseNrOfSeconds.
 * Examples:
 * <ul>
 * <li>"30" (30 seconds)
 * <li>"2 mins 10 secs"
 * <li>"3.5 hours"
 * <li>"1 day"
 * </ul>
 * A property counts as "not set" when the XPath expression evaluates to an
 * empty (or whitespace-only) string, which is what <code>string()</code>
 * yields for a node that does not exist.
 * @param xpath an XPath expression denoting the property
 * @param dfault the default value
 * @return the value in seconds, or <code>dfault</code> if the property was not set
 * @throws IllegalArgumentException if there was a problem with the XPath or
 * the value could not be parsed
 */
protected long getPropertyAsNrOfSeconds(String xpath, long dfault) throws IllegalArgumentException
{
    final String raw;
    try
    {
        raw = XPathAPI.eval(doc, "string(" + xpath + ")").str();
    }
    catch(TransformerException e)
    {
        throw new IllegalArgumentException("illegal argument: '" + xpath + "' (root cause: " + e + ")");
    }
    catch(NullPointerException n)
    {
        // kept from the original: a null evaluation result means "not set"
        return dfault;
    }
    // An empty string contains no tokens and would parse to 0, hiding the
    // default, so "not set" has to be detected before parsing.
    if(raw == null || raw.trim().length() == 0)
    {
        return dfault;
    }
    return parseNrOfSeconds(raw);
}
/**
 * Returns a configuration object representing the sub-tree below the node
 * selected by the given XPath expression. The new configuration keeps this
 * configuration's root node.
 * @param xpath an expression denoting a node that contains a sub-tree.
 * @return the configuration for the selected node (never null)
 * @throws IllegalArgumentException if the expression selects no node
 */
public Configuration getSubConfig(String xpath) throws IllegalArgumentException
{
    final Node selected = getNode(xpath);
    if(selected != null)
    {
        return new Configuration(selected, rootNode);
    }
    throw new IllegalArgumentException("expected tag '" + xpath + "' in configuration file");
}
/**
 * Returns a list of Configuration objects representing the sub-trees below
 * the nodes selected by the given XPath expression. The list keeps this
 * configuration's root node.
 * @param xpath an expression denoting a list of nodes that contain a sub-tree.
 * @return the list of configurations (never null)
 * @throws IllegalArgumentException if getNodes yields null for the expression
 */
public ConfigList getSubConfigList(String xpath) throws IllegalArgumentException
{
    final NodeList selected = getNodes(xpath);
    if(selected != null)
    {
        return new ConfigList(selected, rootNode);
    }
    throw new IllegalArgumentException("node '" + xpath + "' does not exist");
}
/**
 * Constructor for a sub-configuration.
 * @param node the node this configuration evaluates XPath expressions against
 * @param root the root node, used to resolve "$${...}" XPath placeholders
 */
protected Configuration(Node node, Node root)
{
    rootNode = root;
    doc = node;
}
/**
 * Constructor. The given node serves both as the node this configuration
 * represents and as its root node.
 * @param node root node
 */
public Configuration(Node node)
{
    this.rootNode = node;
    this.doc = node;
}
/**
 * Constructor. Parses the XML read from <code>config</code> with a
 * namespace-aware parser; the parsed document becomes both this
 * configuration's node and its root node.
 * @param config reader supplying the XML this Configuration represents.
 * @throws IOException if reading fails
 * @throws SAXException if the XML cannot be parsed
 * @throws RuntimeException if no XML parser could be set up
 */
public Configuration(Reader config) throws IOException, SAXException
{
    try
    {
        // inspired by http://cafeconleche.org/books/xmljava/chapters/ch16s06.html
        final DocumentBuilderFactory parserFactory = DocumentBuilderFactory.newInstance();
        parserFactory.setNamespaceAware(true);
        final DocumentBuilder parser = parserFactory.newDocumentBuilder();
        rootNode = parser.parse(new InputSource(config));
        doc = rootNode;
    }
    catch(ParserConfigurationException cause)
    {
        throw new RuntimeException("Couldn't initialize parser", cause);
    }
}
}
|
|
From: <ot...@us...> - 2003-07-29 15:11:43
|
Update of /cvsroot/larm/larm/src/java/larm/framework
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/framework
Added Files:
Context.java Contextualizable.java Lifecycle.java
Startable.java Stoppable.java
Log Message:
- Big bad update, reorganization, etc.
--- NEW FILE: Context.java ---
package larm.framework;
import java.util.HashMap;
import java.util.Iterator;
/**
 * Context: a simple holder of objects addressed by string keys.
 *
 * @author
 * @version $Id: Context.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public class Context
{
    /** the stored entries, keyed by name */
    private final HashMap entries = new HashMap();

    /**
     * Stores a value under the given key, replacing any previous value.
     */
    public void set(String key, Object value)
    {
        this.entries.put(key, value);
    }

    /**
     * Returns the value stored under the given key, or null if there is none.
     */
    public Object get(String key)
    {
        final Object stored = this.entries.get(key);
        return stored;
    }

    /**
     * Returns an iterator over all keys currently stored.
     */
    public Iterator keys()
    {
        final Iterator keyIterator = this.entries.keySet().iterator();
        return keyIterator;
    }
}
--- NEW FILE: Contextualizable.java ---
package larm.framework;
/**
 * Contextualizable: implemented by objects that can be handed a
 * {@link Context}.
 *
 * @author
 * @version $Id: Contextualizable.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface Contextualizable
{
    /**
     * Passes the context to this object.
     * @param ctx the context
     */
    void contextualize(Context ctx);
}
--- NEW FILE: Lifecycle.java ---
package larm.framework;
import larm.framework.config.Configurable;
import larm.framework.config.Configuration;
/**
 * Lifecycle: static helpers that invoke a lifecycle method on an object only
 * if the object implements the corresponding interface. Objects that do not
 * implement it (including null) are silently ignored.
 *
 * @author
 * @version $Id: Lifecycle.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public class Lifecycle
{
    /**
     * Calls configure(c) on o if o is Configurable.
     */
    public static void configure(Object o, Configuration c)
    {
        if(!(o instanceof Configurable))
        {
            return;
        }
        Configurable target = (Configurable)o;
        target.configure(c);
    }

    /**
     * Calls start() on o if o is Startable.
     */
    public static void start(Object o)
    {
        if(!(o instanceof Startable))
        {
            return;
        }
        Startable target = (Startable)o;
        target.start();
    }

    /**
     * Calls stop() on o if o is Stoppable.
     */
    public static void stop(Object o)
    {
        if(!(o instanceof Stoppable))
        {
            return;
        }
        Stoppable target = (Stoppable)o;
        target.stop();
    }

    /**
     * Calls contextualize(c) on o if o is Contextualizable.
     */
    public static void contextualize(Object o, Context c)
    {
        if(!(o instanceof Contextualizable))
        {
            return;
        }
        Contextualizable target = (Contextualizable)o;
        target.contextualize(c);
    }
}
--- NEW FILE: Startable.java ---
package larm.framework;
/**
 * Startable: implemented by objects that can be started.
 * Lifecycle.start(Object) calls {@link #start()} on any object that
 * implements this interface.
 *
 * @author
 * @version $Id: Startable.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface Startable
{
/**
 * Starts this object.
 */
void start();
}
--- NEW FILE: Stoppable.java ---
package larm.framework;
/**
 * Stoppable: implemented by objects that can be stopped.
 * Lifecycle.stop(Object) calls {@link #stop()} on any object that
 * implements this interface.
 *
 * @author
 * @version $Id: Stoppable.java,v 1.1 2003/07/29 15:11:40 otis Exp $
 */
public interface Stoppable
{
/**
 * Stops this object.
 */
void stop();
}
|
|
From: <ot...@us...> - 2003-07-29 15:11:42
|
Update of /cvsroot/larm/larm/src/java/larm/config
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/java/larm/config
Modified Files:
PropertyManager.java
Removed Files:
ConfigList.java Configuration.java
Log Message:
- Big bad update, reorganization, etc.
Index: PropertyManager.java
===================================================================
RCS file: /cvsroot/larm/larm/src/java/larm/config/PropertyManager.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** PropertyManager.java 24 Jul 2003 11:48:08 -0000 1.1
--- PropertyManager.java 29 Jul 2003 15:11:40 -0000 1.2
***************
*** 1,6 ****
- /*
- *
- * $Id$
- */
package larm.config;
--- 1,2 ----
***************
*** 18,31 ****
*
* @author
*/
public class PropertyManager implements Configurable
{
!
static Logger log = Logger.getLogger(PropertyManager.class.getName());;
!
public PropertyManager()
{
}
!
/**
* @see larm.config.Configurable#configure(larm.config.Configuration)
--- 14,28 ----
*
* @author
+ * @version $Id$
*/
public class PropertyManager implements Configurable
{
!
static Logger log = Logger.getLogger(PropertyManager.class.getName());;
!
public PropertyManager()
{
}
!
/**
* @see larm.config.Configurable#configure(larm.config.Configuration)
***************
*** 43,51 ****
System.setProperty(name, value);
}
!
props = conf.getSubConfigList("property[@file]");
for(int i = 0; i < props.length(); i++)
{
! String fileName=null;
try
{
--- 40,48 ----
System.setProperty(name, value);
}
!
props = conf.getSubConfigList("property[@file]");
for(int i = 0; i < props.length(); i++)
{
! String fileName=null;
try
{
--- ConfigList.java DELETED ---
--- Configuration.java DELETED ---
|
|
From: <ot...@us...> - 2003-07-29 15:11:42
|
Update of /cvsroot/larm/larm/src/config
In directory sc8-pr-cvs1:/tmp/cvs-serv23925/src/config
Added Files:
empty.xml
Log Message:
- Big bad update, reorganization, etc.
--- NEW FILE: empty.xml ---
<?xml version="1.0"?>
<!-- Minimal LARM configuration: properties, no sources, one empty pipeline. -->
<larm>
<properties>
<!-- name/value property; referenced further down as ${pipe1name} -->
<property name="pipe1name" value="pi"/>
<!-- file-based property entry; presumably names a properties file to load (verify against PropertyManager) -->
<property file="my.properties"/>
</properties>
<sources>
</sources>
<pipes>
<!-- the pipeline name is a ${...} placeholder -->
<nonblockingPipeline name="${pipe1name}">
</nonblockingPipeline>
</pipes>
</larm>
|
|
From: <ot...@us...> - 2003-07-29 15:10:43
|
Update of /cvsroot/larm/larm/src/java/larm/pipes
In directory sc8-pr-cvs1:/tmp/cvs-serv23663/src/java/larm/pipes
Added Files:
PipelineManager.java
Log Message:
- Initial checkin.
--- NEW FILE: PipelineManager.java ---
/*
* Created on 30.06.2003 by Administrator
*
* $Id: PipelineManager.java,v 1.1 2003/07/29 15:10:40 otis Exp $
*/
package larm.pipes;
import java.util.HashMap;
import java.util.Iterator;
import java.util.logging.Logger;
import larm.framework.Lifecycle;
import larm.framework.Startable;
import larm.framework.config.ConfigList;
import larm.framework.config.Configurable;
import larm.framework.config.Configuration;
import larm.framework.pipes.NonblockingPipeline;
import larm.framework.pipes.Pipeline;
/**
 * PipelineManager: creates the pipelines declared in the configuration,
 * keeps them by name and starts them.
 *
 * @author Administrator
 * 30.06.2003
 */
public class PipelineManager implements Configurable, Startable
{
    static Logger log = Logger.getLogger(PipelineManager.class.getName());

    /** configured pipelines, keyed by the value of their "name" attribute */
    HashMap pipes = new HashMap();

    /**
     * Returns the pipeline registered under the given name.
     * @param name the pipeline name as given in the configuration
     * @return the pipeline, or null if no pipeline with that name was configured
     */
    public Pipeline getPipeline(String name)
    {
        return (Pipeline)pipes.get(name);
    }

    /**
     * Creates and configures one pipeline for every "nonblockingPipeline"
     * or "blockingPipeline" child element and registers it under its name.
     * @see larm.framework.config.Configurable#configure(larm.framework.config.Configuration)
     */
    public void configure(Configuration conf)
    {
        ConfigList list = conf.getSubConfigList("nonblockingPipeline | blockingPipeline");
        for(int i = 0; i < list.length(); i++)
        {
            Configuration c = list.item(i);
            String type = c.getProperty("name()");
            String name = c.getProperty("@name");
            log.info("found pipe of type " + type + " with name " + name);
            // The element name must be spelled exactly as in the XPath above
            // ("nonblockingPipeline", lower-case b); comparing against
            // "nonBlockingPipeline" never matched.
            if("nonblockingPipeline".equals(type))
            {
                NonblockingPipeline p = new NonblockingPipeline();
                Lifecycle.configure(p, c);
                pipes.put(name, p);
            }
            else if("blockingPipeline".equals(type))
            {
                // NOTE(review): a blockingPipeline element also gets a
                // NonblockingPipeline instance; no blocking implementation is
                // imported here - confirm whether this is intended.
                NonblockingPipeline p = new NonblockingPipeline();
                Lifecycle.configure(p, c);
                pipes.put(name, p);
            }
            else
            {
                assert false : "unexpected pipeline element '" + type + "'";
            }
        }
        if(list.length() == 0)
        {
            log.info("no pipelines to register");
        }
    }

    /**
     * Starts every registered pipeline that is Startable.
     * @see larm.framework.Startable#start()
     */
    public void start()
    {
        for(Iterator it = pipes.values().iterator(); it.hasNext();)
        {
            Lifecycle.start(it.next());
        }
    }
}
|
|
From: <ot...@us...> - 2003-07-24 12:26:51
|
Update of /cvsroot/larm/larm/src/java/larm/framework/sources In directory sc8-pr-cvs1:/tmp/cvs-serv32549/src/java/larm/framework/sources Log Message: Directory /cvsroot/larm/larm/src/java/larm/framework/sources added to the repository |
|
From: <ot...@us...> - 2003-07-24 12:26:51
|
Update of /cvsroot/larm/larm/src/java/larm/framework/pipes In directory sc8-pr-cvs1:/tmp/cvs-serv32549/src/java/larm/framework/pipes Log Message: Directory /cvsroot/larm/larm/src/java/larm/framework/pipes added to the repository |
|
From: <ot...@us...> - 2003-07-24 12:26:51
|
Update of /cvsroot/larm/larm/src/java/larm/framework/config In directory sc8-pr-cvs1:/tmp/cvs-serv32549/src/java/larm/framework/config Log Message: Directory /cvsroot/larm/larm/src/java/larm/framework/config added to the repository |
|
From: <ot...@us...> - 2003-07-24 12:22:55
|
Update of /cvsroot/larm/larm/src/java/larm/sources In directory sc8-pr-cvs1:/tmp/cvs-serv31947/src/java/larm/sources Log Message: Directory /cvsroot/larm/larm/src/java/larm/sources added to the repository |
|
From: <ot...@us...> - 2003-07-24 12:22:55
|
Update of /cvsroot/larm/larm/src/java/larm/framework In directory sc8-pr-cvs1:/tmp/cvs-serv31947/src/java/larm/framework Log Message: Directory /cvsroot/larm/larm/src/java/larm/framework added to the repository |
|
From: <ot...@us...> - 2003-07-24 12:22:55
|
Update of /cvsroot/larm/larm/src/test/larm/framework In directory sc8-pr-cvs1:/tmp/cvs-serv31947/src/test/larm/framework Log Message: Directory /cvsroot/larm/larm/src/test/larm/framework added to the repository |
|
From: <ot...@us...> - 2003-07-24 12:22:55
|
Update of /cvsroot/larm/larm/src/config In directory sc8-pr-cvs1:/tmp/cvs-serv31947/src/config Log Message: Directory /cvsroot/larm/larm/src/config added to the repository |
|
From: <ot...@us...> - 2003-07-24 12:22:55
|
Update of /cvsroot/larm/larm/src/java/larm/root In directory sc8-pr-cvs1:/tmp/cvs-serv31947/src/java/larm/root Log Message: Directory /cvsroot/larm/larm/src/java/larm/root added to the repository |
|
From: <ot...@us...> - 2003-07-24 12:18:21
|
Update of /cvsroot/larm/larm/docs
In directory sc8-pr-cvs1:/tmp/cvs-serv31326/docs
Modified Files:
contents.txt crawler.txt framework.txt indexer.txt
packages.txt processors.txt
Log Message:
- Updated.
Index: contents.txt
===================================================================
RCS file: /cvsroot/larm/larm/docs/contents.txt,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** contents.txt 24 Jun 2003 17:50:44 -0000 1.5
--- contents.txt 24 Jul 2003 12:18:17 -0000 1.6
***************
*** 1,85 ****
!
! Specification Document for LARM.
!
! $Id$
!
! Log:
! ---------------+-----------+---------------------------------------------------
! cmarschn 10-Jun-03 Created. Will write all but the parts in ()
! cmarschn 11-Jun-03 Added sections for framework, extended crawler,
! common development patterns
! cmarschn 15-Jun-03 Worked on the crawler part, wrote framework
! cmarschn 20-Jun-03
! cmarschn 23-Jun-03
! ---------------+-----------+---------------------------------------------------
!
!
! Contents
!
! -------------------------------------------------------------------------------
!
! [Part I: Framework] framework.txt
!
! I. Messaging Framework framework.txt
! 1. Pipelines
! 2. Sources and Drains
! 3. Notifications or Polling
! 4. Batch file operation
! 5. Batch file indexing
!
! II. Configuration framework.txt
! 1. XML Configuration
! 2. Configuration files
! 3. Startup/Shutdown
!
! [Part II: Gatherers]
!
! III. Crawler crawler.txt
! 1. Crawl Requests
! 3. DNS Handling
! 4. Robot Exclusion
! 5. Link Analysis
! 6. Distribution
! 7. Persistence
! 8. Configuration
! 9. Log File(s)
! 10. Recrawls
!
! (IV. File System Gatherer)
! 1. Configuration
! 2. Reindexing
!
! (V. Database Gatherer)
!
! (VI. Other Sources (JMS, Mail, Web Services...))
!
! [Part III: Record Processors] processors.txt
!
! VII. Format conversion (PDF, Word, HTML etc.)
! VIII. Link Extraction
! IX. Distribution to different index fields
! X. Applying link analysis to document weights
!
! [Part IV: Indexer] indexer.txt
!
! XI. The Indexer
! 1. Message formats
! 2. Persistence
! 3. Configuration
! 4. Log File(s)
!
! ([Part V: Search])
!
! (XII. Search interface)
! (XIII. Data Display)
!
! [Part VI: Common Development Patterns]
! XIV. Logging
! XV. Test Cases
! XVI. Package layout
!
! [Part VII: Appendix]
!
! XVII. Used Packages packages.txt
! XVIII. Glossary
--- 1,85 ----
!
! Specification Document for LARM.
!
! $Id$
!
! Log:
! ---------------+-----------+---------------------------------------------------
! cmarschn 10-Jun-03 Created. Will write all but the parts in ()
! cmarschn 11-Jun-03 Added sections for framework, extended crawler,
! common development patterns
! cmarschn 15-Jun-03 Worked on the crawler part, wrote framework
! cmarschn 20-Jun-03
! cmarschn 23-Jun-03
! ---------------+-----------+---------------------------------------------------
!
!
! Contents
!
! -------------------------------------------------------------------------------
!
! [Part I: Framework] framework.txt
!
! I. Messaging Framework framework.txt
! 1. Pipelines
! 2. Sources and Drains
! 3. Notifications or Polling
! 4. Batch file operation
! 5. Batch file indexing
!
! II. Configuration framework.txt
! 1. XML Configuration
! 2. Configuration files
! 3. Startup/Shutdown
!
! [Part II: Gatherers]
!
! III. Crawler crawler.txt
! 1. Crawl Requests
! 3. DNS Handling
! 4. Robot Exclusion
! 5. Link Analysis
! 6. Distribution
! 7. Persistence
! 8. Configuration
! 9. Log File(s)
! 10. Recrawls
!
! (IV. File System Gatherer)
! 1. Configuration
! 2. Reindexing
!
! (V. Database Gatherer)
!
! (VI. Other Sources (JMS, Mail, Web Services...))
!
! [Part III: Record Processors] processors.txt
!
! VII. Format conversion (PDF, Word, HTML etc.)
! VIII. Link Extraction
! IX. Distribution to different index fields
! X. Applying link analysis to document weights
!
! [Part IV: Indexer] indexer.txt
!
! XI. The Indexer
! 1. Message formats
! 2. Persistence
! 3. Configuration
! 4. Log File(s)
!
! ([Part V: Search])
!
! (XII. Search interface)
! (XIII. Data Display)
!
! [Part VI: Common Development Patterns]
! XIV. Logging
! XV. Test Cases
! XVI. Package layout
!
! [Part VII: Appendix]
!
! XVII. Used Packages packages.txt
! XVIII. Glossary
Index: crawler.txt
===================================================================
RCS file: /cvsroot/larm/larm/docs/crawler.txt,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** crawler.txt 24 Jun 2003 17:51:48 -0000 1.5
--- crawler.txt 24 Jul 2003 12:18:17 -0000 1.6
***************
*** 1,272 ****
!
! $Id$
!
! -------------------------------------------------------------------------------
! III. The Crawler
!
! The crawler contains a special type of pipeline whose configuration is very
! limited. The reason is that the crawler parts use some shared data structures
! and contain some internal dependencies (e.g. the order in which different
! processing steps are done). Nevertheless we decided to keep up the pipeline
! paradigm to separate concerns into different classes and to avoid a large
! "Crawler" class that contains such different operations like Document fetching
! and robot exclusion.
!
! 1. The Fetcher: Crawl Requests and Crawler Output
!
! The Fetcher sits at the core of the crawler. It takes CrawlRequests and outputs
! raw CrawlRecords.
!
! A crawl request consists of the following fields:
!
! method: one of NEW or CHECK_FOR_RECRAWL [or CHECK_FOR_SERVER_RUNNING]. NEW
! loads a document as given by a URL. If network errors occur the fetcher can
! be parameterized to wait and retry a couple of times, set the host to
! BAD_STATE if unsuccessful, and output a SERVER_PROBLEM message. CHECK checks
! the document for changes (in the MD5). The crawler may behave differently when
! CHECK_FOR_RECRAWL or CHECK_FOR_SERVER_RUNNING was chosen. In the latter case
! the crawler may decide to check the host only once. In the case of CHECK an
! MD5 checksum has to be provided. [When using CHECK we don't look for changed
! Dates since they have proven to be unreliable.]
!
! url: URL: The URL of the document to crawl
!
! MD5Hash: MD5Hash: An MD5 Hash of the document, if method is CHECK
!
! interface CrawlRequest {
!
! // enum RequestMethod
! final static byte NEW = 1;
! final static byte CHECK_FOR_RECRAWL = 2;
!
! byte requestMethod; // type RequestMethod
! URL url;
! [long lastModified;] // if CHECK_FOR_RECRAWL, can be sent as
! If-Modified-Since]
! MD5Hash MD5Hash; // set to null if requestMethod == NEW
! }
!
! A CrawlRecord is the output of a crawler that contains the raw document as loaded
! by the crawler threads. It contains the following fields:
!
! url: URL: The original URL of the crawl
!
! finalURL: URL: The final URL if HTTP responds with 30x result codes. The
! crawler can be configured with a maximum number of redirects to follow if such
! a result code occurs.
!
! requestMethod: byte. request method as in CrawlRequest
!
! fingerprint: MD5Hash. hash value of the document contents.
!
! HTTPstatus: short. The HTTP status code as returned by the last try. e.g. 200
!
! crawlerStatus: short. error code if not reflected through the HTTPStatus code
!
! MIMEType: String. The MIME type of the document loaded. e.g. "text/html"
!
! encoding: String. The document encoding if provided
!
! lastModified: Date. time when the doc was last changed.
!
! headers: the HTTP headers returned
!
! encoding: String. Content-Encoding as specified in a HTTP header
!
! contents: Object. Either a byte[] or a char[] depending on the MIME type and
! encoding. Since HTML or XML files themselves may contain an "encoding"
! attribute on their own the fetcher doesn't make any assumptions on the real
! content type.
!
! interface CrawlRecord {
!
! // crawler status
! final static byte CS_OK 0 // if HTTPStatus == 200
! final static byte CS_ERROR_IN_HTTP 1 // if HTTPStatus != 200
! final static byte CS_TOO_MANY_REDIRECTS 2 // e.g. 301/302 redirect loop
! final static byte CS_UNKNOWN_HOST 3 // host name doesn't exist
! final static byte CS_HOST_NOT_REACHABLE 4 // server not running
! final static byte CS_READ_TIMEOUT 5 // server or network too slow
! final static byte CS_NO_ROUTE_TO_HOST 6 // network problem
! // (NoRouteToHostException)
! final static byte CS_PORT_CLOSED 7 // no server running on this
! // port (ConnectException)
! final static byte CS_FILE_TOO_LARGE 8 // file exceeded maximum size
! // and was truncated
! final static byte CS_IO_EXCEPTION 100 // unknown IO exception
!
! URL url; // -> IndexRecord.URI
! URL finalURL; // -> IndexRecord.secondaryURIs[0]
! byte requestMethod; // see above
! MD5Hash fingerprint; // -> IndexRecord.fingerprint
! short HTTPStatus; // HTTP status code
! byte crawlerStatus; // that are not reflected in HTTPStatus
! String MIMEType; // IndexRecord.MIMEType
! String[][] headers; // HTTP headers
! String encoding; // ISO, UTF, Base64, Gzip, etc.
! long lastModified; // same as in CrawlRequest if not modified, else timestamp
! byte[] contents;
! }
!
! The Fetcher is controlled by a FetcherManager that distributes CrawlRequests
! among different threads. The threads get batches of crawl requests if available
! to minimize synchronization. They can also be configured such that they collect
! a couple of documents before they put them to the output queue.
!
! We will use the hashCode of the hostname modulo the number of threads to assign
! fetches to the different threads. Each thread will have a priority queue and a
! small host name cache for the incoming requests. (To start with, we will use
! Java's built-in host name cache.) This way a thread can do its work without the
! need to communicate with or block other threads.
!
! The priority queue is used to keep hosts in a wait state while new hosts are
! crawled. Each time a page is crawled from a host it will come into a wait state
! for a configurable threshold until the next request is issued.
!
! [If implemented using non blocking-IO it may also be that a thread keeps
! downloading more than one host at once. This is presumably faster since it saves
! a lot of threads and with it the task switching overhead. The old IO also needs
! the data to be copied a couple of times. The best implementation still has to be
! figured out. Presumably a set of Fetcher threads that are responsible for a
! number of hosts and use non-blocking IO will show the best performance.]
!
! The Fetcher tries to be completely bound to network I/O and will not perform
! extractions if the content is compressed (that is, it sends an "accept-encoding:
! gzip" message if configured but will not perform a decompression step).
!
! The Fetcher also has to keep track of the hosts. Since it cannot hold infos
! about all hosts in RAM, a (LRU) caching mechanism has to be used that contains
! the following information for each host (HostInfo):
!
! hostName: String: a DNS name as an identifier
! IP-Address: InetAddress: IP address of this host
! ipExpires: long: Expiry time for the IP cache
! robots: : a data structure that is used by a RobotsTxtFilter
! robotsExpires: long: a time that defines when robots.txt has to be reloaded
!
!
! interface HostInfo {
! String hostName;
! InetAddress ipAddress;
! long ipExpires;
! ? robots;
! long robotsExpires;
! }
!
! Since HostInfos are looked up using their hostNames they should be stored in a
! simple hash with the hostName as its key.
!
! From the caching point of view it is advisable that incoming CrawlRequests are
! not evenly distributed over the host name space. From a network efficiency point
! of view exactly this should be the case. This conflict may be resolved in the
! following way: Say a batch of CrawlRequests contains a maximum of 5000 hosts and
! a maximum of 100,000 requests. If one of these numbers is exceeded the batch is
! cut into several pieces that each obeys these rules. Then the HostInfo cache can
! be as large as the number of hosts in the batch and may only need to access
! secondary storage as a new batch is started. It can be implemented as a simple
! LRU cache.
!
! 3. DNS Handling
!
! Since DNS resolution takes a lot of time it is advisable to store ipAddresses of
! host names in the HostInfo structure. This calls for a URL implementation that
! doesn't do a resolution on its own, and an HTTP 1.1 implementation that can use
! the ipAddress as given in the HostInfo structure.
!
! For now we use Jakarta HTTPClient which doesn't do address resolution.
! Currently DNS resolution works the following way:
! a) a request to open a connection is sent to HTTPClient
! b) HTTPClient creates a new java.net.Socket with the host name as its argument
! c) if the host name is an IP address, Socket opens it directly.
! If it is a DNS name,
! d) Socket calls getCachedAddress()
! e) getCachedAddress will first perform a linear scan through its host name list
! to see whether resolved names have expired
! f) the host name is looked up in the cache. If it is not found, it is resolved
! through an internal Naming Service class and saved to the cache.
! Since e) takes linear time even when the name is in the cache it unnecessarily
! slows things down if we have 1000s of host names in the cache. In this case
! we would have to resolve the IP address for ourselves, or HTTPClient would have
! to do it, since it later needs the host name for sending an HTTP 1.1 request.
!
!
! 4. Robot Exclusion
!
! Since the incoming CrawlRequests may have been generated a long time ago the
! fetcher has to take care about changed robot exclusion policies while it is
! fetching the documents. For this sake a filter has to be applied shortly before
! a request is made to the server, and robots.txt files have to be reloaded before
! the first request to a server is made and after a specific time has elapsed.
!
! 6. Persistence
!
! CrawlRequests are usually performed in batches that are read from secondary
! storage. These files again may contain a large number of requests that are read
! in steps of <n> requests as specified in the config. Fast crawls demand for a
! large number of hosts in these files and for avoidance of the same hosts in
! subsequent URLs [see Shkapenyuk/Suel].
!
! CrawlRecords again are also written in batches of <n> records and are also
! distributed among several files. They may also be distributed among different
! directories in order to use NFS as a cheap distribution mechanism for the
! indexing step.
!
!
! 7. Distribution
!
! A Fetcher/FetcherManager combination can be distributed among different hosts if
! extracted links are divided such that a node is made responsible for a distinct
! set of hosts. The communication between different crawler nodes takes place in
! batches. To avoid a central component that distributes these Collections of
! CrawlRequests, each node has to know about the other nodes and which hosts this
! node addresses.
! It seems viable to use the hash value of the hostname of the URL to be crawled
! to split this up. But this is supposed to be done in a processing component like
! directly after link extraction. In the Shkapenyuk/Suel paper this is named the
! "crawling application". Thus it is not part of the crawler itself.
!
! For ease of use the crawler should adapt if a new crawler node is added. Say
! there are three nodes, and all crawl requests are divided into three queues that
! are distributed to these nodes. If a new node is started, the crawling
! application should get a message and start dividing the URLs into four pieces.
! On the other hand, if more than one crawling application is needed, the fetchers
! need to know where to send the downloaded files. This again could be divided by
! the URL. A similar mechanism should apply.
!
! 8. Configuration
!
! - FetcherManager
! - method: NIO or old IO
! - number of threads
! - NIO: number of concurrent requests (=concurrent hosts) per thread
! - number of seconds between subsequent requests to a host
! - number of redirects to follow before the page is abandoned with TOO_MANY_REDIRECTS
! - maximum file size
! - number of seconds to wait for a server to send the file completely
! - HTTP User Agent String
! - size of host name cache
! - size of temp cache for loading docs
! - use "Accept-Encoding: gzip [Compress, Deflate?]"
!
! 9. From CrawlRecords to IndexRecords
!
! Crawl- and IndexRecords seem to be pretty similar, but in fact they differ in a
! variety of features.
! An IndexRecord is crawl-agnostic. It is used for different document sources and
! thus doesn't know about HTTP status codes and the like.
!
! [There will be a converter between Crawl- and IndexRecords at some time in the
! pipeline. This will be configurable such that CrawlRecord entries may become
! generic fields within an IndexRecord]
!
! 9. Log Files
!
! 10. Incremental Crawling
!
! 11. Startup/Shutdown
!
! 12. Packages and Dependencies
!
!
!
!
--- 1,272 ----
!
! $Id$
!
! -------------------------------------------------------------------------------
! III. The Crawler
!
! The crawler contains a special type of pipeline whose configuration is very
! limited. The reason is that the crawler parts use some shared data structures
! and contain some internal dependencies (e.g. the order in which different
! processing steps are done). Nevertheless we decided to keep up the pipeline
! paradigm to separate concerns into different classes and to avoid a large
! "Crawler" class that contains such different operations like Document fetching
! and robot exclusion.
!
! 1. The Fetcher: Crawl Requests and Crawler Output
!
! The Fetcher sits at the core of the crawler. It takes CrawlRequests and outputs
! raw CrawlRecords.
!
! A crawl request consists of the following fields:
!
! method: one of NEW or CHECK_FOR_RECRAWL [or CHECK_FOR_SERVER_RUNNING]. NEW
! loads a document as given by a URL. If network errors occur the fetcher can
! be parameterized to wait and retry a couple of times, set the host to
! BAD_STATE if unsuccessful, and output a SERVER_PROBLEM message. CHECK checks
! the document for changes (in the MD5). The crawler may behave differently when
! CHECK_FOR_RECRAWL or CHECK_FOR_SERVER_RUNNING was chosen. In the latter case
! the crawler may decide to check the host only once. In the case of CHECK an
! MD5 checksum has to be provided. [When using CHECK we don't look for changed
! Dates since they have proven to be unreliable.]
!
! url: URL: The URL of the document to crawl
!
! MD5Hash: MD5Hash: An MD5 Hash of the document, if method is CHECK
!
! interface CrawlRequest {
!
! // enum RequestMethod
! final static byte NEW = 1;
! final static byte CHECK_FOR_RECRAWL = 2;
!
! byte requestMethod; // type RequestMethod
! URL url;
! [long lastModified;] // if CHECK_FOR_RECRAWL, can be sent as
! If-Modified-Since]
! MD5Hash MD5Hash; // set to null if requestMethod == NEW
! }
!
! A CrawlRecord is the output of a crawler that contains the raw document as loaded
! by the crawler threads. It contains the following fields:
!
! url: URL: The original URL of the crawl
!
! finalURL: URL: The final URL if HTTP responds with 30x result codes. The
! crawler can be configured with a maximum number of detours to take if such a result
! code occurs.
!
! requestMethod: byte. request method as in CrawlRequest
!
! fingerprint: MD5Hash. hash value of the document contents.
!
! HTTPstatus: short. The HTTP status code as returned by the last try. e.g. 200
!
! crawlerStatus: short. error code if not reflected through the HTTPStatus code
!
! MIMEType: String. The MIME type of the document loaded. e.g. "text/html"
!
! encoding: String. The document encoding if provided
!
! lastModified: Date. time when the doc was last changed.
!
! headers: the HTTP headers returned
!
! encoding: String. Content-Encoding as specified in a HTTP header
!
! contents: Object. Either a byte[] or a char[] depending on the MIME type and
! encoding. Since HTML or XML files themselves may contain an "encoding"
! attribute on their own the fetcher doesn't make any assumptions on the real
! content type.
!
! interface CrawlRecord {
!
! // crawler status
! final static byte CS_OK 0 // if HTTPStatus == 200
! final static byte CS_ERROR_IN_HTTP 1 // if HTTPStatus != 200
! final static byte CS_TOO_MANY_REDIRECTS 2 // e.g. 301/302 redirect loop
! final static byte CS_UNKNOWN_HOST 3 // host name doesn't exist
! final static byte CS_HOST_NOT_REACHABLE 4 // server not running
! final static byte CS_READ_TIMEOUT 5 // server or network too slow
! final static byte CS_NO_ROUTE_TO_HOST 6 // network problem
! // (NoRouteToHostException)
! final static byte CS_PORT_CLOSED 7 // no server running on this
! // port (ConnectException)
! final static byte CS_FILE_TOO_LARGE 8 // file exceeded maximum size
! // and was truncated
! final static byte CS_IO_EXCEPTION 100 // unknown IO exception
!
! URL url; // -> IndexRecord.URI
! URL finalURL; // -> IndexRecord.secondaryURIs[0]
! byte requestMethod; // see above
! MD5Hash fingerprint; // -> IndexRecord.fingerprint
! short HTTPStatus; // HTTP status code
! byte crawlerStatus; // that are not reflected in HTTPStatus
! String MIMEType; // IndexRecord.MIMEType
! String[][] headers; // HTTP headers
! String encoding; // ISO, UTF, Base64, Gzip, etc.
! long lastModified; // same as in CrawlRequest if not modified, else timestamp
! byte[] contents;
! }
!
! The Fetcher is controlled by a FetcherManager that distributes CrawlRequests
! among different threads. The threads get batches of crawl requests if available
! to minimize synchronization. They can also be configured such that they collect
! a couple of documents before they put them to the output queue.
!
! We will use the hashCode of the hostname modulo the number of threads to assign
! fetches to the different threads. Each thread will have a priority queue and a
! small host name cache for the incoming requests. (For a start we will use
! Java's built-in host name cache.) This way a thread can do its work without the
! need to communicate with or block other threads.
!
! The priority queue is used to keep hosts in a wait state while new hosts are
! crawled. Each time a page is crawled from a host it will come into a wait state
! for a configurable threshold until the next request is issued.
!
! [If implemented using non blocking-IO it may also be that a thread keeps
! downloading more than one host at once. This is presumably faster since it saves
! a lot of threads and with it the task switching overhead. The old IO also needs
! the data to be copied a couple of times. The best implementation still has to be
! figured out. Presumably a set of Fetcher threads that are responsible for a
! number of hosts and use non-blocking IO will show the best performance.]
!
! The Fetcher tries to be completely bound to network I/O and will not perform
! extractions if the content is compressed (that is, it sends an "accept-encoding:
! gzip" message if configured but will not perform a decompression step).
!
! The Fetcher also has to keep track of the hosts. Since it cannot hold infos
! about all hosts in RAM, a (LRU) caching mechanism has to be used that contains
! the following information for each host (HostInfo):
!
! hostName: String: a DNS name as an identifier
! IP-Address: InetAddress: IP-Address of this host
! ipExpires: long: Expiry time for the IP cache
! robots: : a data structure that is used by a RobotsTxtFilter
! robotsExpires: long: a time that defines when robots.txt has to be reloaded
!
!
! interface HostInfo {
! String hostName;
! InetAddress ipAddress;
! long ipExpires
! ? robots;
! long robotsExpires;
! }
!
! Since HostInfos are looked up using their hostNames they should be stored in a
! simple hash with the hostName as its key.
!
! From the caching point of view it is advisable that incoming CrawlRequests are
! not evenly distributed over the host name space. From a network efficiency point
! of view exactly this should be the case. This conflict may be resolved in the
! following way: Say a batch of CrawlRequests contains a maximum of 5000 hosts and
! a maximum of 100,000 requests. If one of these numbers is exceeded the batch is
! cut into several pieces that each obeys these rules. Then the HostInfo cache can
! be as large as the number of hosts in the batch and may only need to access
! secondary storage as a new batch is started. It can be implemented as a simple
! LRU cache.
!
! 3. DNS Handling
!
! Since DNS resolution takes a lot of time it is advisable to store ipAddresses of
! host names in the HostInfo structure. This calls for a URL implementation that
! doesn't do a resolution on its own, and an HTTP 1.1 implementation that can use
! the ipAddress as given in the HostInfo structure.
!
! For now we use Jakarta HTTPClient which doesn't do address resolution.
! Currently DNS resolution works the following way:
! a) a request to open a connection is sent to HTTPClient
! b) HTTPClient creates a new java.net.Socket with the host name as its argument
! c) if the host name is an IP address, Socket opens it directly.
! If it is a DNS name,
! d) Socket calls getCachedAddress()
! e) getCachedAddress will first perform a linear scan through its host name list
! to see whether resolved names have expired
! f) the host name is looked up in the cache. If it is not found, it is resolved
! through an internal Naming Service class and saved to the cache.
! Since e) takes linear time even when the name is in the cache it unnecessarily
! slows things down if we have 1000s of host names in the cache. In this case
! we would have to resolve the IP address for ourselves, or HTTPClient would have
! to do it, since it later needs the host name for sending an HTTP 1.1 request.
!
!
! 4. Robot Exclusion
!
! Since the incoming CrawlRequests may have been generated a long time ago the
! fetcher has to take care about changed robot exclusion policies while it is
! fetching the documents. For this sake a filter has to be applied shortly before
! a request is made to the server, and robots.txt files have to be reloaded before
! the first request to a server is made and after a specific time has elapsed.
!
! 6. Persistence
!
! CrawlRequests are usually performed in batches that are read from secondary
! storage. These files again may contain a large number of requests that are read
! in steps of <n> requests as specified in the config. Fast crawls demand for a
! large number of hosts in these files and for avoidance of the same hosts in
! subsequent URLs [see Shkapenyuk/Suel].
!
! CrawlRecords again are also written in batches of <n> records and are also
! distributed among several files. They may also be distributed among different
! directories in order to use NFS as a cheap distribution mechanism for the
! indexing step.
!
!
! 7. Distribution
!
! A Fetcher/FetcherManager combination can be distributed among different hosts if
! extracted links are divided such that a node is made responsible for a distinct
! set of hosts. The communication between different crawler nodes takes place in
! batches. To avoid a central component that distributes these Collections of
! CrawlRequests, each node has to know about the other nodes and which hosts this
! node addresses.
! It seems viable to use the hash value of the hostname of the URL to be crawled
! to split this up. But this is supposed to be done in a processing component like
! directly after link extraction. In the Shkapenyuk/Suel paper this is named the
! "crawling application". Thus it is not part of the crawler itself.
!
! For ease of use the crawler should adapt if a new crawler node is added. Say
! there are three nodes, and all crawl requests are divided into three queues that
! are distributed to these nodes. If a new node is started, the crawling
! application should get a message and start dividing the URLs into four pieces.
! On the other hand, if more than one crawling application is needed, the fetchers
! need to know where to send the downloaded files. This again could be divided by
! the URL. A similar mechanism should apply.
!
! 8. Configuration
!
! - FetcherManager
! - method: NIO or old IO
! - number of threads
! - NIO: number of concurrent requests (=concurrent hosts) per thread
! - number of seconds between subsequent requests to a host
! - number of redirects to follow before the page is abandoned with TOO_MANY_REDIRECTS
! - maximum file size
! - number of seconds to wait for a server to send the file completely
! - HTTP User Agent String
! - size of host name cache
! - size of temp cache for loading docs
! - use "Accept-Encoding: gzip [Compress, Deflate?]"
!
! 9. From CrawlRecords to IndexRecords
!
! Crawl- and IndexRecords seem to be pretty similar, but in fact they differ in a
! variety of features.
! An IndexRecord is crawl-agnostic. It is used for different document sources and
! thus doesn't know about HTTP status codes and the like.
!
! [There will be a converter between Crawl- and IndexRecords at some time in the
! pipeline. This will be configurable such that CrawlRecord entries may become
! generic fields within an IndexRecord]
!
! 9. Log Files
!
! 10. Incremental Crawling
!
! 11. Startup/Shutdown
!
! 12. Packages and Dependencies
!
!
!
!
Index: framework.txt
===================================================================
RCS file: /cvsroot/larm/larm/docs/framework.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** framework.txt 30 Jun 2003 14:19:36 -0000 1.3
--- framework.txt 24 Jul 2003 12:18:17 -0000 1.4
***************
*** 1,316 ****
!
! $Id$
!
! -------------------------------------------------------------------------------
! Part I. Framework
! -------------------------------------------------------------------------------
!
!
! I. Configuration
!
! Configuration drove the first discussions about LARM since it was a major
! weakness of the old crawler that these issues hadn't been properly addressed.
!
! In general, several options exist:
! 1. Use a property file
! 2. Use an XML file
! 3. Use several XML files and separate pipeline construction and parameterization
! 4. Use configuration messages that are passed through the pipelines and allow
! for reconfiguring it at runtime.
! The fourth option would be nice if the crawler should be controlled via a web
! interface or the like. The third one resembles the Avalon Phoenix model,
! although it is not sure if that really does the same.
!
! After the discussion we came to the conclusion that
! - Java property files are too restricted to model pipelines
! - Avalon seems to be overkill and contradicts KISS
!
! Nevertheless we use some of the Avalon ideas, namely:
! - A component initializes its subcomponents by calling a configure() method.
! Maybe other lifecycle methods may be implemented as well.
! - configure() gets its part of the configuration file. It is up to the enclosing
! component to cut out the right part (using the class below and XPath)
!
!
! 1. XML Configuration
!
! At this time we use a single XML file to form and configure the pipelines.
!
! Configuration is done through a single class that wraps a DOM representation of
! the XML and facilitates access through XPath.
!
! Currently the interface looks like this:
!
! class Configuration
! {
! Configuration(Reader config);
! Configuration getSubConfig(String xpath);
!
! String getPropertyAsString(String xpath);
! X getPropertyAsX(String xpath);
!
! Node getCurrentNode(); // can we hide this?
! Node getNode(String xpath); // can we hide this?
! NodeList getNodes(String xpath); // can we hide this?
! }
!
! Configuration can resolve strings like ${my.property} to a system property or
! something like $${/my/xpath/} to an xpath expression from the current file.
!
! [Remark]
!
! The LARM main program analyzes the following subsections:
! <properties> <pipes> and <sources>
!
! The properties section is similar to ANT's properties section. Its contents is
! read at startup time. Dependencies are resolved when a property is used (i.e.
! resolved by an underlying component).
!
! The <properties>, <pipes> and <sources> sections are passed to three global class
! instances (in Avalon they would be called blocks): the config.PropertyManager,
! the pipes.PipeManager and the sources.SourceManager.
!
! Each of these classes initializes its subcomponents in the same way they are
! initialized. This is very similar to Avalon's Inversion of Control pattern
! (IoC):
!
! All pipeline classes (PipeManager, SourceManager, Source, MessageProcessor,
! etc.) have or can have a method called "configure(Configuration c)", derived
! from a lifecycle interface called config.Configurable.
!
!
! 2. Startup/Shutdown
!
! LARM gets the path to an XML configuration file as a parameter. Different server
! modes depend on the sources and pipeline configurations in these files.
!
! Startup should be something like
!
! java larm.root.LARM <configfile>
!
! LARM then
! 1. resolves properties in the <properties> section through
! config.PropertyManager
! 2. initialises the pipelines and registers them
! 3. initialises the sources and registers them
! 4. passes the registry of sources and pipes to the classes implementing
! the framework.Contextualizable interface
! 5. calls "configure" on each of the pipelines (through PipeManager.configure())
! 6. calls "configure" on each of the sources (through SourceManager.configure())
! 7. calls "start" on the nonblocking pipes (through PipeManager.start())
! 8. calls "start" on the nonblocking sources (through SourceManager.start())
!
!
! When is LARM shut down? Since pipelines naturally wait for incoming messages,
! this depends on the nature of the Sources and other services. For development we
! will most likely use sources that run through a directory, emit the messages
! contained to a pipeline, and shut down. That means the source may signal that
! the application should exit. Since it is likely that the pipelines are still at
! work, the app will have to wait until all messages are consumed and processed.
!
! [There may be other services that may call for a shutdown: a CTRL-C handler or a
! web service interface.]
!
!
! II. Messaging Framework
!
! LARM basically is concerned with processing pieces of data and moving it along
! what we call a processing pipeline.
!
! The pipeline framework is a set of classes that simplifies this task: It allows
! for a separation of different assembly parts of the whole system. That way
! different parts of the pipeline can be put into different classes and can be
! developed rather independently.
!
! In contrast to message-queue systems it is a low-level in-process framework: If
! it is known that only one thread is involved, the components need not even be thread
! safe. The aim is to be able to process a very large number of small messages
! very rapidly.
!
!
! 1 Active and Passive Components
!
! Active Components run in their own thread. They may respond to external events
! (socket calls, timer events or whatsoever). Passive Components just provide
! services to other passive or active components. Sources (see below) will mostly
! be active components. That is, they operate the subsequent pipeline.
!
! A MessageProcessor (MP) is a simple class that is called by a pipeline to handle
! a message. It may alter the message, filter it, save it somewhere, etc. It
! either returns null (forming a message sink) or it returns the message (most of
! the time the same message it got, but it may also return a different one).
! Examples of an MP would be a RobotExclusionFilter (filtering some of the URLs
! from the URL list), a PDF to XML converter (reducing PDF to a common metaformat
! that is understood by the indexing component), a FileSystemStorage that saves
! incoming documents on disk, a JMSStorage that saves them to a message queue, or
! a LuceneStorage that adds a document to a Lucene index. An MP could as well
! contain a BlockingPipeline (see below) forming a nested pipeline.
!
! [Is the storage a required part of the pipeline? If so I think we should break
! it up into more distinct pieces to there can be some control programmatically.
! If not is there a required order?]
!
! 2. MessagePipelines
!
! MessageProcessors are put together into message pipelines. There are two types
! of them: BlockingPipelines and NonblockingPipelines.
!
! Pipelines process objects of type Message:
!
! interface Message implements Serializable
! {}
!
! You can see that this is a very generic concept. Its behavior only depends on
! the processor implementations. Messages have to be serializable since they will
! mostly stay on disk.
! Messages should only be data containers and should not contain business logic
! or be dependent on types other than primitive types, Collections, or strings.
! Objects included in the message should form a part-of relationship, no
! referential relationship, since this would make serialization and
! deserialization much more complicated.
!
! Messages are put into pipelines:
!
! interface Pipeline implements MessageProcessor
! {
! public Message processMessage(Message);
! public Message processMessages(Collection); // Collection<Message>
! }
!
! There are two types of pipelines: BlockingPipelines and NonblockingPipelines.
!
! A BlockingPipeline processes a message by calling (at most) all of its
! MessageProcessors in a row. A MessageProcessor gets the Message, may alter it,
! and returns it again. The reference returned is passed to the next processor in
! the row. After the last MP the resulting message is returned.
!
! A BlockingPipeline may be designed not to be thread safe (e.g. because it is used
! from within a NonBlockingPipeline and thus only accessed by one thread), as may
! the MPs. (A BlockingPipeline may as well be an MP, which allows for nesting
! pipelines).
!
! A NonBlockingPipeline has an extra thread that handles the messages. Therefore,
! at a processMessage() call, the message is written into a message queue and the
! call always returns null. The processor thread handles all messages until the queue
! is empty. Internally the AsynchronousProcessor consists of a BlockingPipeline
! that is operated by the ProcessorThread.
!
! The Queue implementation will usually be an in-memory FIFO queue, but may be
! exchanged depending on the needs. A queue may block if it is full.
!
! The MessageProcessor interface looks like this:
!
! interface MessageProcessor
! {
! public Message processMessage(Message);
! }
!
! If you implement a MessageProcessor and need lifecycle methods, you can
! implement one or more of the interfaces larm.config.Configurable,
! larm.framework.Contextualizable, or larm.framework.Startable. The Pipeline will
! take care to call the methods contained in these interfaces in the order as
! specified in section I.
!
! Configuration:
!
! pipeline parameters:
!
! - @name: A global name that identifies the pipe. If existent, the pipeline
! will be registered within the PipelineManager.
! - processors: A block of processors. They are put into the pipeline in the
! order in which they are specified in the configuration.
!
! Additionally, NonblockingPipeline has the following parameters:
!
! - @queueSize: integer. number of messages the queue is able to handle. If there
! are more messages, a call to putMessage() will block until all messages are
! fed into the queue
! - queue (optional): sets a different queue implementation than the default
! larm.pipes.InMemoryQueue.
! Parameter: @type: A class name of type larm.pipes.Queue that is used for the
! queue. May contain config parameters in the block
!
! Example:
!
! <blockingPipeline name="pipe1">
! <processors>
! <processor type="larm.processors.DoNothingProcessor">
! <someArg>someVal</someArg>
! </processor>
! </processors>
! </blockingPipeline>
!
! <nonBlockingPipeline name="pipe2" queueSize="100">
! <queue type="myPackage.myQueue">
! <myQueueParameter/>
! </queue>
! <processors>
! <processor type="larm.processors.DoNothingProcessor">
! <someArg>someVal</someArg>
! </processor>
! <processor type="larm.processors.MyFancyProcessor"/>
! </processors>
! </nonBlockingPipeline>
!
! 3. Sources and Drains
!
! Sources are classes that actively pump new messages into a pipeline. In the
! simplest case a source loads a file given as a parameter, puts it into a
! pipeline, and exits.
!
! Framework provides the following sources:
!
! - FileSource
! reads messages from a given file or a set of files, puts them into the
! pipeline, and exits.
! The file must be a valid batch.
!
! parameters:
! - fileName: The file to read
! - fileSet: a file set like in ANT. Describes files to be put in to
! the queue
! - pipeline: The name of the pipeline
! - delete true/false: delete file(s) after they were put to the pipe.
!
!
! xml config example:
! <fileSource>
! <fileName>c:/larm/test/file1</fileName>
! <pipeline>testPipe</pipeline>
! </fileSource>
!
! example 2:
! <fileSource>
! <fileset dir="c:/larm/test/*.lst"/>
! <pipeline>testPipe</pipeline>
! </fileSource>
!
! - FileMonitorSource
! monitors a set of files given by the fileset parameter and looks for changes
! in the set described by the fileset pattern.
! when new files are found, they are appended to an internal in-memory queue.
! These files are then put into the given pipeline, deleted, and then removed from
! the internal in-memory queue.
!
! parameters:
! - fileset
! - delay: time (in seconds) between runs of the monitor
!
! <fileMonitorSource>
! <fileset dir="somedir"/>
! <pipeline>testPipe</pipeline>
! <delay>30 s</delay>
! </fileMonitorSource>
!
! 4. Notifications or Poll[ing]
!
!
!
! 5. Batch file operation
!
! A batch file contains a set of Objects inherited from the type Message. They are
! read in blocks
!
! [6. Batch file indexing]
!
!
--- 1,316 ----
!
! $Id$
!
! -------------------------------------------------------------------------------
! Part I. Framework
! -------------------------------------------------------------------------------
!
!
! I. Configuration
!
! Configuration drove the first discussions about LARM since it was a major
! weakness of the old crawler that these issues hadn't been properly addressed.
!
! In general, several options exist:
! 1. Use a property file
! 2. Use an XML file
! 3. Use several XML files and separate pipeline construction and parameterization
! 4. Use configuration messages that are passed through the pipelines and allow
! for reconfiguring it at runtime.
! The fourth option would be nice if the crawler should be controlled via a web
! interface or the like. The third one resembles the Avalon Phoenix model,
! although it is not sure if that really does the same.
!
! After the discussion we came to the conclusion that
! - Java property files are too restricted to model pipelines
! - Avalon seems to be overkill and contradicts KISS
!
! Nevertheless we use some of the Avalon ideas, namely:
! - A component initializes its subcomponents by calling a configure() method.
! Maybe other lifecycle methods may be implemented as well.
! - configure() gets its part of the configuration file. It is up to the enclosing
! component to cut out the right part (using the class below and XPath)
!
!
! 1. XML Configuration
!
! At this time we use a single XML file to form and configure the pipelines.
!
! Configuration is done through a single class that wraps a DOM representation of
! the XML and facilitates access through XPath.
!
! Currently the interface looks like this:
!
! class Configuration
! {
! Configuration(Reader config);
! Configuration getSubConfig(String xpath);
!
! String getPropertyAsString(String xpath);
! X getPropertyAsX(String xpath);
!
! Node getCurrentNode(); // can we hide this?
! Node getNode(String xpath); // can we hide this?
! NodeList getNodes(String xpath); // can we hide this?
! }
!
! Configuration can resolve strings like ${my.property} to a system property or
! something like $${/my/xpath/} to an xpath expression from the current file.
!
! [Remark]
!
! The LARM main program analyzes the following subsections:
! <properties> <pipes> and <sources>
!
! The properties section is similar to ANT's properties section. Its contents is
! read at startup time. Dependencies are resolved when a property is used (i.e.
! resolved by an underlying component).
!
! The <properties>, <pipes> and <sources> sections are passed to three global class
! instances (in Avalon they would be called blocks): the config.PropertyManager,
! the pipes.PipeManager and the sources.SourceManager.
!
! Each of these classes initializes its subcomponents in the same way they are
! initialized. This is very similar to Avalon's Inversion of Control pattern
! (IoC):
!
! All pipeline classes (PipeManager, SourceManager, Source, MessageProcessor,
! etc.) have or can have a method called "configure(Configuration c)", derived
! from a lifecycle interface called config.Configurable.
!
!
! 2. Startup/Shutdown
!
! LARM gets the path to an XML configuration file as a parameter. Different server
! modes depend on the sources and pipeline configurations in these files.
!
! Startup should be something like
!
! java larm.root.LARM <configfile>
!
! LARM then
! 1. resolves properties in the <properties> section through
! config.PropertyManager
! 2. initialises the pipelines and registers them
! 3. initialises the sources and registers them
! 4. passes the registry of sources and pipes to the classes implementing
! the framework.Contextualizable interface
! 5. calls "configure" on each of the pipelines (through PipeManager.configure())
! 6. calls "configure" on each of the sources (through SourceManager.configure())
! 7. calls "start" on the nonblocking pipes (through PipeManager.start())
! 8. calls "start" on the nonblocking sources (through SourceManager.start())
!
!
! When is LARM shut down? Since pipelines naturally wait for incoming messages,
! this depends on the nature of the Sources and other services. For development we
! will most likely use sources that run through a directory, emit the messages
! contained to a pipeline, and shut down. That means the source may signal that
! the application should exit. Since it is likely that the pipelines are still at
! work, the app will have to wait until all messages are consumed and processed.
!
! [There may be other services that may call for a shutdown: a CTRL-C handler or a
! web service interface.]
!
!
! II. Messaging Framework
!
! LARM basically is concerned with processing pieces of data and moving it along
! what we call a processing pipeline.
!
! The pipeline framework is a set of classes that simplifies this task: It allows
! for a separation of different assembly parts of the whole system. That way
! different parts of the pipeline can be put into different classes and can be
! developed rather independently.
!
! In contrast to message-queue systems it is a low-level in-process framework: If
! it is known that only one thread is involved, the components need not even be thread
! safe. The aim is to be able to process a very large number of small messages
! very rapidly.
!
!
! 1 Active and Passive Components
!
! Active Components run in their own thread. They may respond to external events
! (socket calls, timer events or whatsoever). Passive Components just provide
! services to other passive or active components. Sources (see below) will mostly
! be active components. That is, they operate the subsequent pipeline.
!
! A MessageProcessor (MP) is a simple class that is called by a pipeline to handle
! a message. It may alter the message, filter it, save it somewhere, etc. It
! either returns null (forming a message sink) or it returns the message (most of
! the time the same message it got, but it may also return a different one).
! Examples of an MP would be a RobotExclusionFilter (filtering some of the URLs
! from the URL list), a PDF to XML converter (reducing PDF to a common metaformat
! that is understood by the indexing component), a FileSystemStorage that saves
! incoming documents on disk, a JMSStorage that saves them to a message queue, or
! a LuceneStorage that adds a document to a Lucene index. An MP could as well
! contain a BlockingPipeline (see below) forming a nested pipeline.
!
! [Is the storage a required part of the pipeline? If so I think we should break
! it up into more distinct pieces so there can be some control programmatically.
! If not is there a required order?]
!
! 2. MessagePipelines
!
! MessageProcessors are put together into message pipelines. There are two types
! of them: BlockingPipelines and NonblockingPipelines.
!
! Pipelines process objects of type Message:
!
! interface Message implements Serializable
! {}
!
! You can see that this is a very generic concept. Its behavior only depends on
! the processor implementations. Messages have to be serializable since they will
! mostly stay on disk.
! Messages should only be data containers and should not contain business logic
! or be dependent on types other than primitive types, Collections, or strings.
! Objects included in the message should form a part-of relationship, no
! referential relationship, since this would make serialization and
! deserialization much more complicated.
!
! Messages are put into pipelines:
!
! interface Pipeline implements MessageProcessor
! {
! public Message processMessage(Message);
! public Message processMessages(Collection); // Collection<Message>
! }
!
! There are two types of pipelines: BlockingPipelines and NonblockingPipelines.
!
! A BlockingPipeline processes a message by calling (at most) all of its
! MessageProcessors in a row. A MessageProcessor gets the Message, may alter it,
! and returns it again. The reference returned is passed to the next processor in
! the row. After the last MP the resulting message is returned.
!
! A BlockingPipeline may be designed not to be thread safe (e.g. because it is used
! from within a NonBlockingPipeline and thus only accessed by one thread), as may
! the MPs. (A BlockingPipeline may as well be an MP, which allows for nesting
! pipelines).
!
! A NonBlockingPipeline has an extra thread that handles the messages. Therefore,
! at a processMessage() call, the message is written into a message queue and the
! call always returns null. The processor thread handles all messages until the queue
! is empty. Internally the AsynchronousProcessor consists of a BlockingPipeline
! that is operated by the ProcessorThread.
!
! The Queue implementation will usually be an in-memory FIFO queue, but may be
! exchanged depending on the needs. A queue may block if it is full.
!
! The MessageProcessor interface looks like this:
!
! interface MessageProcessor
! {
! public Message processMessage(Message);
! }
!
! If you implement a MessageProcessor and need lifecycle methods, you can
! implement one or more of the interfaces larm.config.Configurable,
! larm.framework.Contextualizable, or larm.framework.Startable. The Pipeline will
! take care to call the methods contained in these interfaces in the order as
! specified in section I.
!
! Configuration:
!
! pipeline parameters:
!
! - @name: A global name that identifies the pipe. If existent, the pipeline
! will be registered within the PipelineManager.
! - processors: A block of processors. They are put into the pipeline in the
! order in which they are specified in the configuration.
!
! Additionally, NonblockingPipeline has the following parameters:
!
! - @queueSize: integer. number of messages the queue is able to handle. If there
! are more messages, a call to putMessage() will block until all messages are
! fed into the queue
! - queue (optional): sets a different queue implementation than the default
! larm.pipes.InMemoryQueue.
! Parameter: @type: A class name of type larm.pipes.Queue that is used for the
! queue. May contain config parameters in the block
!
! Example:
!
! <blockingPipeline name="pipe1">
! <processors>
! <processor type="larm.processors.DoNothingProcessor">
! <someArg>someVal</someArg>
! </processor>
! </processors>
! </blockingPipeline>
!
! <nonBlockingPipeline name="pipe2" queueSize="100">
! <queue type="myPackage.myQueue">
! <myQueueParameter/>
! </queue>
! <processors>
! <processor type="larm.processors.DoNothingProcessor">
! <someArg>someVal</someArg>
! </processor>
! <processor type="larm.processors.MyFancyProcessor"/>
! </processors>
! </nonBlockingPipeline>
!
! 3. Sources and Drains
!
! Sources are classes that actively pump new messages into a pipeline. In the
! simplest case a source loads a file given as a parameter, puts it into a
! pipeline, and exits.
!
! Framework provides the following sources:
!
! - FileSource
! reads messages from a given file or a set of files, puts them into the
! pipeline, and exits.
! The file must be a valid batch.
!
! parameters:
! - fileName: The file to read
! - fileSet: a file set like in ANT. Describes files to be put into
! the queue
! - pipeline: The name of the pipeline
! - delete true/false: delete file(s) after they were put to the pipe.
!
!
! xml config example:
! <fileSource>
! <fileName>c:/larm/test/file1</fileName>
! <pipeline>testPipe</pipeline>
! </fileSource>
!
! example 2:
! <fileSource>
! <fileset dir="c:/larm/test/*.lst"/>
! <pipeline>testPipe</pipeline>
! </fileSource>
!
! - FileMonitorSource
! monitors a set of files given by the fileset parameter and looks for changes
! in the set described by the fileset pattern.
! when new files are found, they are appended to an internal in-memory queue.
! These files are then put into the pipeline given, deleted, and removed from
! the internal in-memory queue.
!
! parameters:
! - fileset
! - delay: time (in seconds) between runs of the monitor
!
! <fileMonitorSource>
! <fileset dir="somedir"/>
! <pipeline>testPipe</pipeline>
! <delay>30 s</delay>
! </fileMonitorSource>
!
! 4. Notifications or Poll[ing]
!
!
!
! 5. Batch file operation
!
! A batch file contains a set of Objects inherited from the type Message. They are
! read in blocks
!
! [6. Batch file indexing]
!
!
Index: indexer.txt
===================================================================
RCS file: /cvsroot/larm/larm/docs/indexer.txt,v
retrieving revision 1.5
retrieving revision 1.6
diff -C2 -d -r1.5 -r1.6
*** indexer.txt 30 Jun 2003 14:19:36 -0000 1.5
--- indexer.txt 24 Jul 2003 12:18:17 -0000 1.6
***************
*** 1,228 ****
!
! $Id$
!
! -------------------------------------------------------------------------------
! Part IV. The Indexer
! -------------------------------------------------------------------------------
!
! The indexer is a simple component that gets messages of type IndexRecord from a
! queue and outputs them to an index. Our implementation will use a Lucene index
! for this task, although other search engines could be used as well.
!
! Usually the IndexRecords are provided in batches which may reside in files of
! IndexRecord objects. A BatchFileSource can be used to monitor a directory for
! new batch files.
!
! For each IndexRecord, the Indexer gets an IndexRecord that contains the
! following fields:
!
! command: byte: ADD, [UPDATE], or DELETE. Defines if the IndexRecord should be
! added, updated, or deleted from the index. (UPDATE may not be necessary since
! an ADD with the same PrimaryURI may automatically perform an UPDATE)
!
! primaryURI: URI: primary URI of the IndexRecord. If the IndexRecord comes from
! the web or a file system, this is simply the URL. If it represents a tuple
! from a database, the provider has to come up with a URN that forms a primary
! key for the IndexRecord.
!
! Since web documents may be accessible under different URLs a mechanism has to be
! provided to find a primary URL, e.g. by using the one with the highest number of
! inlinks.
!
! In case of ADD or UPDATE the following information has to be provided:
!
! secondaryURIs: Collection: A list of secondary URIs of the IndexRecord. If the
! URI is ambiguous (e.g. if a document is represented by more than one URL) this
!
! MD5Hash: MD5Hash: The MD5 hash of the doc. In case of a recrawl this hash will
! be sent to the gatherer to determine whether the IndexRecord's contents have
! changed.
!
! lastChangedDate: Date: The time this indexing has occurred. In case of a crawler
! the time the document was fetched.
!
! documentWeight: float. It is left to the processing pipeline to set this field
! accordingly, e.g. by analyzing the document-link-graph.
!
! MIMEtype: String. The MIME type of the original document
!
! fields: A Collection of <fieldname: String, fieldweight: float, value:
! [LargeText], methods: byte, fieldType: byte> describing the document content.
! They will be indexed as-is. "flags" can be one or more from <INDEXED, STORED,
! TOKENIZED>. fieldType is one of <TEXT, DATE>
!
! The exact contents of these fields are specified through the RecordProcessors.
! Usually they will contain a step in which binary content (PDFs etc) is converted
! to text, a step in which documents are split up into different fields (e.g.
! title, header, headings, body)
!
! The indexer then performs the analysis of different fields and splits the field
! up into index tokens using the standard Lucene analysers infrastructure.
!
! The following shows Java interfaces for the type described. Remarks show a
! possible implementation using J2SDK 1.5 (Tiger):
!
! interface IndexRecord implements Message
! {
! // enum Command
! final static byte CMD_ADD = (byte)'a';
! final static byte CMD_UPDATE = (byte)'u';
! final static byte CMD_DELETE = (byte)'d';
!
! byte command; // type: Co...
[truncated message content] |
|
From: <ot...@us...> - 2003-07-24 12:18:20
|
Update of /cvsroot/larm/larm In directory sc8-pr-cvs1:/tmp/cvs-serv31326 Modified Files: README.txt update.website.txt Log Message: - Updated. Index: README.txt =================================================================== RCS file: /cvsroot/larm/larm/README.txt,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** README.txt 24 Jun 2003 17:22:05 -0000 1.2 --- README.txt 24 Jul 2003 12:18:17 -0000 1.3 *************** *** 1,27 **** ! ! $Id$ ! ! LARM - Lucene Advanced Retrieval Machine. ! ! ! Please read the docs in docs/ first, starting with contents.txt ! ! ! You can use ANT to build the project. Simply call ! ! ant ! ! and LARM will build into ./build. The first time it builds it will look for ! external libraries like HTTPClient and download them if necessary. ! ! LARM was written using the Eclipse IDE. As soon as the libraries are downloaded, ! You can import the workspace simply by importing the checked out directory into ! the eclipse workspace. Eclipse will build the project into the ./build directory. ! ! To run LARM you will be able to use larm.root.LARM. As soon as this is written ! you can call ! ! java -classpath ./build/classes:[other libs] larm.root.LARM <configfile> ! ! We will provide config files for various applications. ! --- 1,40 ---- ! ! $Id$ ! ! LARM - Lucene Advanced Retrieval Machine. ! ! Please read the docs in docs/ first, starting with contents.txt ! ! Prerequisites: You need ! - Java 2 SDK starting from 1.4.0 ! - ANT (see http://jakarta.apache.org/ant) ! ! You can use ANT to build the project. Simply call ! ! ant ! ! and LARM will build into ./build. [The first time it builds it will look for ! external libraries like HTTPClient and downloads them if necessary.] ! ! LARM was written using the Eclipse IDE. As soon as the libraries are downloaded, ! You can import the workspace simply by importing the checked out directory into ! the eclipse workspace. Eclipse will build the project into the ./build directory. 
! ! To run LARM you will be able to use larm.root.LARM. ! ! You can use ! ! larm.bat empty ! or ! larm.sh empty ! ! to run LARM with an empty test pipeline in src/config/empty.xml. ! This is just for testing purposes. ! ! We will provide config files for various applications in src/config. Runtime ! arguments will have to be passed as system properties or in a properties file ! that is referenced from the config file. ! ! ! ! Index: update.website.txt =================================================================== RCS file: /cvsroot/larm/larm/update.website.txt,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** update.website.txt 30 Jun 2003 14:17:25 -0000 1.1 --- update.website.txt 24 Jul 2003 12:18:17 -0000 1.2 *************** *** 1,8 **** ! In order to update the website... ! ! - check out the module "website" from LARM's CVS ! - edit the files in htdocs/ ! - commit the changes ! - log in via SSH to shell.sourceforge.net ! - cd to /home/groups/l/la/larm ! - perform a cvs -d :ext:<user>@cvs1:/cvsroot/larm co website --- 1,8 ---- ! In order to update the website... ! ! - check out the module "website" from LARM's CVS ! - edit the files in htdocs/ ! - commit the changes ! - log in via SSH to shell.sourceforge.net ! - cd to /home/groups/l/la/larm ! - perform a cvs -d :ext:<user>@cvs1:/cvsroot/larm co website |
|
From: <ot...@us...> - 2003-07-24 11:48:19
|
Update of /cvsroot/larm/larm/src/java/larm/config
In directory sc8-pr-cvs1:/tmp/cvs-serv26943/src/java/larm/config
Added Files:
PropertyManager.java
Log Message:
- Initial checkin.
--- NEW FILE: PropertyManager.java ---
/*
*
* $Id: PropertyManager.java,v 1.1 2003/07/24 11:48:08 otis Exp $
*/
package larm.config;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.Logger;
import larm.framework.config.*;
/**
* PropertyManager
*
* @author
*/
/**
 * Publishes the properties declared in a configuration block as JVM system
 * properties.
 *
 * Two kinds of <code>property</code> entries are handled by
 * {@link #configure(Configuration)}:
 * <ul>
 *   <li>entries carrying both a <code>name</code> and a <code>value</code>
 *       attribute, which are set one by one, and</li>
 *   <li>entries carrying a <code>file</code> attribute, which name a
 *       <code>java.util.Properties</code> file whose entries are merged into
 *       the system properties.</li>
 * </ul>
 *
 * @author
 */
public class PropertyManager implements Configurable
{
    static Logger log = Logger.getLogger(PropertyManager.class.getName());

    public PropertyManager()
    {
    }

    /**
     * Reads all <code>property</code> entries below <code>conf</code> and
     * copies them into the system properties.
     *
     * Inline name/value entries are processed first, property files second,
     * so a key defined in a file overrides an inline entry of the same name.
     * A missing property file is reported at CONFIG level and skipped; any
     * other I/O error is logged as SEVERE and the remaining files are still
     * processed.
     *
     * @param conf the configuration node whose <code>property</code> children
     *             are evaluated
     * @see larm.config.Configurable#configure(larm.config.Configuration)
     */
    public void configure(Configuration conf)
    {
        log.info("configuring PropertyManager");

        // Pass 1: <property name="..." value="..."/>
        ConfigList props = conf.getSubConfigList("property[@name and @value]");
        for (int i = 0; i < props.length(); i++)
        {
            Configuration p = props.item(i);
            String name = p.getPropertyAsStringDontResolve("@name");
            String value = p.getPropertyAsStringDontResolve("@value");
            log.info("found property: " + name + "=" + value);
            System.setProperty(name, value);
        }

        // Pass 2: <property file="..."/>
        props = conf.getSubConfigList("property[@file]");
        for (int i = 0; i < props.length(); i++)
        {
            // Declared outside the try block so the catch clauses can report it.
            String fileName = null;
            try
            {
                Configuration p = props.item(i);
                fileName = p.getProperty("@file");
                log.info("found property file: " + fileName);

                Properties properties = new Properties();
                FileInputStream in = new FileInputStream(fileName);
                try
                {
                    properties.load(in);
                }
                finally
                {
                    // Always release the file handle, even if load() fails.
                    in.close();
                }

                // Merge instead of System.setProperties(): replacing the whole
                // table would discard the JVM's own properties (java.*, user.*,
                // ...) as well as everything set in pass 1.
                System.getProperties().putAll(properties);
            }
            catch (FileNotFoundException e)
            {
                log.config("Could not find property file '" + fileName + "'");
            }
            catch (IOException e)
            {
                log.log(Level.SEVERE, "I/O Exception while opening property file " + fileName, e);
            }
        }

        log.exiting(PropertyManager.class.getName(), "configure");
    }
}
|