From: <lor...@us...> - 2011-05-13 09:21:30
|
Revision: 2803 http://dl-learner.svn.sourceforge.net/dl-learner/?rev=2803&view=rev Author: lorenz_b Date: 2011-05-13 09:21:24 +0000 (Fri, 13 May 2011) Log Message: ----------- Added new NER using Lingpipe API and a local DBpedia dictionary. Modified Paths: -------------- trunk/components-ext/pom.xml trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/NERTest.java Added Paths: ----------- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeNER.java trunk/components-ext/src/main/resources/tbsl/models/dbpedia_lingpipe.dictionary Modified: trunk/components-ext/pom.xml =================================================================== --- trunk/components-ext/pom.xml 2011-05-13 08:19:14 UTC (rev 2802) +++ trunk/components-ext/pom.xml 2011-05-13 09:21:24 UTC (rev 2803) @@ -14,6 +14,10 @@ <id>Simmetrics</id> <url>http://maven.mse.jhu.edu/m2repository/</url> </repository> + <repository> + <id>Harvard Med</id> + <url>http://repo.open.med.harvard.edu/nexus/content/repositories/public/</url> + </repository> </repositories> <parent> @@ -106,6 +110,11 @@ <artifactId>opennlp-maxent</artifactId> <version>3.0.1-incubating</version> </dependency> + <dependency> + <groupId>com.aliasi</groupId> + <artifactId>lingpipe</artifactId> + <version>4.0.1</version> + </dependency> </dependencies> </project> Added: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeNER.java =================================================================== --- trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeNER.java (rev 0) +++ trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeNER.java 2011-05-13 09:21:24 UTC (rev 2803) @@ -0,0 +1,51 @@ +package org.dllearner.algorithm.tbsl.nlp; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import com.aliasi.chunk.Chunk; +import com.aliasi.chunk.Chunker; +import com.aliasi.chunk.Chunking; +import com.aliasi.dict.Dictionary; +import com.aliasi.dict.ExactDictionaryChunker; +import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory; +import com.aliasi.util.AbstractExternalizable; + +public class LingPipeNER implements NER{ + + private static final String DICTIONARY_PATH = "src/main/resources/tbsl/models/dbpedia_lingpipe.dictionary"; + + private Chunker ner; + + public LingPipeNER() { + this(true, true); + } + + public LingPipeNER(boolean caseSensitive) { + this(caseSensitive, true); + } + + public LingPipeNER(boolean caseSensitive, boolean allMatches) { + try { + Dictionary<String> dictionary = (Dictionary<String>) AbstractExternalizable.readObject(new File(DICTIONARY_PATH)); + ner = new ExactDictionaryChunker(dictionary, IndoEuropeanTokenizerFactory.INSTANCE, allMatches, caseSensitive); + } catch (IOException e) { + e.printStackTrace(); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + } + + @Override + public List<String> getNamedEntitites(String sentence) { + List<String> namedEntities = new ArrayList<String>(); + Chunking chunking = ner.chunk(sentence); + for(Chunk chunk : chunking.chunkSet()){ + namedEntities.add(sentence.substring(chunk.start(), chunk.end())); + } + return namedEntities; + } + +} Property changes on: trunk/components-ext/src/main/java/org/dllearner/algorithm/tbsl/nlp/LingPipeNER.java ___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/components-ext/src/main/resources/tbsl/models/dbpedia_lingpipe.dictionary =================================================================== (Binary files differ) Property changes on: trunk/components-ext/src/main/resources/tbsl/models/dbpedia_lingpipe.dictionary ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Modified: trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/NERTest.java =================================================================== --- trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/NERTest.java 2011-05-13 08:19:14 UTC (rev 2802) +++ trunk/components-ext/src/test/java/org/dllearner/algorithm/tbsl/NERTest.java 2011-05-13 09:21:24 UTC (rev 2803) @@ -1,6 +1,9 @@ package org.dllearner.algorithm.tbsl; +import java.util.List; + import org.dllearner.algorithm.tbsl.nlp.DBpediaSpotlightNER; +import org.dllearner.algorithm.tbsl.nlp.LingPipeNER; import org.dllearner.algorithm.tbsl.nlp.NER; public class NERTest { @@ -12,8 +15,17 @@ String sentence = "When did Nirvana record Nevermind?"; NER ner = new DBpediaSpotlightNER(); - System.out.println(ner.getNamedEntitites(sentence)); + long startTime = System.currentTimeMillis(); + List<String> namedEntities = ner.getNamedEntitites(sentence); + System.out.format("Using DBpedia Spotlight WebService (%d ms):\n", System.currentTimeMillis()-startTime); + System.out.println(namedEntities + "\n"); + ner = new LingPipeNER(); + startTime = System.currentTimeMillis(); + namedEntities = ner.getNamedEntitites(sentence); + System.out.format("Using Lingpipe API with local DBpedia dictionary (%d ms):\n", System.currentTimeMillis()-startTime); + System.out.println(namedEntities); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |