[DL-Learner SVN] SF.net SVN: dl-learner:[4164] trunk/components-core/src/main/java/org/dllearner/algorithms/isle

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4164
          http://sourceforge.net/p/dl-learner/code/4164
Author:   lorenz_b
Date:     2013-11-21 13:38:03 +0000 (Thu, 21 Nov 2013)
Log Message:
-----------
Refactored context extractors.

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java	2013-11-21 13:16:13 UTC (rev 4163)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java	2013-11-21 13:38:03 UTC (rev 4164)
@@ -37,7 +37,7 @@
 		return instance;
 	}
 
-	public TextDocument tag(String text) {
+	public TextDocument generateDocument(String text) {
 		TextDocument document = new TextDocument();
 	    // create an empty Annotation just with the given text
 	    Annotation annotatedDocument = new Annotation(text);
@@ -72,7 +72,7 @@
 	}
 	
 	public static void main(String[] args) throws Exception {
-		TextDocument document = TextDocumentGenerator.getInstance().tag("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. ");
+		TextDocument document = TextDocumentGenerator.getInstance().generateDocument("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. ");
 		System.out.println(document);
 	}
 }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java	2013-11-21 13:16:13 UTC (rev 4163)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java	2013-11-21 13:38:03 UTC (rev 4164)
@@ -34,6 +34,13 @@
 	public Document getReferencedDocument() {
 		return referencedDocument;
 	}
+	
+	/**
+	 * @return the tokens
+	 */
+	public ArrayList<Token> getTokens() {
+		return tokens;
+	}
 
 	public String getString(){
         StringBuilder sb = new StringBuilder();

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java	2013-11-21 13:16:13 UTC (rev 4163)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java	2013-11-21 13:38:03 UTC (rev 4164)
@@ -16,6 +16,7 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.Version;
+import org.dllearner.algorithms.isle.TextDocumentGenerator;
 import org.dllearner.algorithms.isle.index.TextDocument;
 
 import java.io.File;
@@ -61,7 +62,7 @@
 			ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs;
 			for (int i = 0; i < result.length; i++) {
 				Document doc = searcher.doc(result[i].doc);
-				documents.add(new TextDocument(doc.get(searchField)));
+				documents.add(TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField)));
 			}
 		} catch (ParseException e) {
 			e.printStackTrace();
@@ -85,7 +86,7 @@
 			try {
 				Document doc = indexReader.document(i);
 				String content = doc.get(searchField);
-				documents.add(new TextDocument(content));
+				documents.add(TextDocumentGenerator.getInstance().generateDocument(content));
 			} catch (IOException e) {
 				e.printStackTrace();
 			}

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java	2013-11-21 13:16:13 UTC (rev 4163)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java	2013-11-21 13:38:03 UTC (rev 4164)
@@ -3,6 +3,14 @@
  */
 package org.dllearner.algorithms.isle.wsd;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+
+import org.dllearner.algorithms.isle.TextDocumentGenerator;
+import org.dllearner.algorithms.isle.index.Token;
+
 import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
@@ -10,12 +18,7 @@
 import edu.stanford.nlp.pipeline.Annotation;
 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 import edu.stanford.nlp.util.CoreMap;
-import org.dllearner.algorithms.isle.index.TextDocument;
 
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Properties;
-
 /**
  * @author Lorenz Buehmann
  *
@@ -36,26 +39,29 @@
 	@Override
 	public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) {
 		//split text into sentences
-		List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getContent());
+		List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent());
 
 		//find the sentence containing the token of the annotation
-		int tokenStart = annotation.getOffset();
-		int index = 0;
+		Token firstToken = annotation.getTokens().get(0);
 		for (CoreMap sentence : sentences) {
-			String s = sentence.toString();
-			if (index <= tokenStart && s.length() > tokenStart) {
+			boolean found = false;
+			for (CoreLabel label : sentence.get(TokensAnnotation.class)) {
+				// this is the text of the token
+				String word = label.get(TextAnnotation.class);
+				if(word.equals(firstToken.getRawForm())){
+					found = true;
+					break;
+				}
+			}
+			if(found){
 				List<String> context = new ArrayList<String>();
 				for (CoreLabel label : sentence.get(TokensAnnotation.class)) {
 					// this is the text of the token
 					String word = label.get(TextAnnotation.class);
-					
-					if(!word.isEmpty() && !word.matches("\\p{Punct}")){
-						context.add(word);
-					}
+					context.add(word);
 				}
 				return context;
 			}
-			index += s.length();
 		}
 		throw new RuntimeException("Token " + annotation.getString() + " not found in text " + annotation.getReferencedDocument().getRawContent());
 	}
@@ -79,9 +85,8 @@
 		String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software,"
 				+ " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology.";
 	
-		String token = "services";
 		SentenceBasedContextExtractor extractor = new SentenceBasedContextExtractor();
-		List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(new TextDocument(s), s.indexOf(token), token.length()));
+		List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token("American"))));
 		System.out.println(context);
 	}
 

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java	2013-11-21 13:16:13 UTC (rev 4163)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java	2013-11-21 13:38:03 UTC (rev 4164)
@@ -3,6 +3,14 @@
  */
 package org.dllearner.algorithms.isle.wsd;
 
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Properties;
+
+import org.dllearner.algorithms.isle.TextDocumentGenerator;
+import org.dllearner.algorithms.isle.index.Token;
+
 import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
 import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
@@ -10,18 +18,13 @@
 import edu.stanford.nlp.pipeline.Annotation;
 import edu.stanford.nlp.pipeline.StanfordCoreNLP;
 import edu.stanford.nlp.util.CoreMap;
-import org.dllearner.algorithms.isle.index.TextDocument;
 
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Properties;
-
 /**
  * @author Lorenz Buehmann
- *
+ * 
  */
-public class WindowBasedContextExtractor implements ContextExtractor{
-	
+public class WindowBasedContextExtractor implements ContextExtractor {
+
 	private StanfordCoreNLP pipeline;
 	private int tokensLeft = 10;
 	private int tokensRight = 10;
@@ -29,57 +32,66 @@
 	public WindowBasedContextExtractor(int tokensLeft, int tokensRight) {
 		this.tokensLeft = tokensLeft;
 		this.tokensRight = tokensRight;
-		
+
 		Properties props = new Properties();
 		props.put("annotators", "tokenize, ssplit");
 		pipeline = new StanfordCoreNLP(props);
 	}
-	
+
 	public WindowBasedContextExtractor(int tokensLeftRight) {
 		tokensLeft = tokensLeftRight;
 		tokensRight = tokensLeftRight;
-		
+
 		Properties props = new Properties();
 		props.put("annotators", "tokenize, ssplit");
 		pipeline = new StanfordCoreNLP(props);
 	}
-	
+
 	public WindowBasedContextExtractor() {
 		Properties props = new Properties();
 		props.put("annotators", "tokenize, ssplit");
 		pipeline = new StanfordCoreNLP(props);
 	}
 
-	/* (non-Javadoc)
-	 * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java.lang.String, java.lang.String)
+	/*
+	 * (non-Javadoc)
+	 * 
+	 * @see
+	 * org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java
+	 * .lang.String, java.lang.String)
 	 */
 	@Override
 	public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) {
 		// split text into sentences
-		List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getContent());
+		List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent());
 
 		// find the sentence containing the token of the annotation
-		int tokenStart = annotation.getOffset();
-		int index = 0;
+		Token firstToken = annotation.getTokens().get(0);
 		for (CoreMap sentence : sentences) {
-			String s = sentence.toString();
-			if (index <= tokenStart && s.length() > tokenStart) {
+			boolean found = false;
+			for (CoreLabel label : sentence.get(TokensAnnotation.class)) {
+				// this is the text of the token
+				String word = label.get(TextAnnotation.class);
+				if (word.equals(firstToken.getRawForm())) {
+					found = true;
+					break;
+				}
+			}
+			if (found) {
 				List<String> context = new ArrayList<String>();
 				for (CoreLabel label : sentence.get(TokensAnnotation.class)) {
 					// this is the text of the token
 					String word = label.get(TextAnnotation.class);
-
 					context.add(word);
 				}
 				return context;
 			}
-			index += s.length();
 		}
-		throw new RuntimeException("Token " + annotation + " not found in text "
-				+ annotation.getReferencedDocument().getContent());
+		throw new RuntimeException("Token " + annotation.getString() + " not found in text "
+				+ annotation.getReferencedDocument().getRawContent());
 
 	}
-	
+
 	private List<CoreMap> getSentences(String document) {
 		// create an empty Annotation just with the given text
 		Annotation annotation = new Annotation(document);
@@ -94,14 +106,14 @@
 
 		return sentences;
 	}
-	
+
 	public static void main(String[] args) throws Exception {
 		String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software,"
 				+ " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology.";
-	
+
 		String token = "services";
 		WindowBasedContextExtractor extractor = new WindowBasedContextExtractor();
-		List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(new TextDocument(s), s.indexOf(token), token.length()));
+		List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token("American"))));
 		System.out.println(context);
 	}
 

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[DL-Learner SVN] SF.net SVN: dl-learner:[4164] trunk/components-core/src/main/java/org/dllearner/algorithms/isle

[DL-Learner SVN] SF.net SVN: dl-learner:[4164] trunk/components-core/src/main/java/org/dllearner/algorithms/isle