[DL-Learner SVN] SF.net SVN: dl-learner:[4107] trunk/components-core/src/main/java/org/dllearner/algorithms/isle

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 4107
          http://sourceforge.net/p/dl-learner/code/4107
Author:   andremelo
Date:     2013-09-10 15:52:48 +0000 (Tue, 10 Sep 2013)
Log Message:
-----------
- Adding the following method to the EntityCandidateGenerator interface: HashMap<Annotation,Set<Entity>> getCandidatesMap(Set<Annotation> annotations)
- Adding first version of the postprocessing from the trie implementation

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java
===================================================================

--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java	2013-09-10 15:49:18 UTC (rev 4106)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java	2013-09-10 15:52:48 UTC (rev 4107)
@@ -3,6 +3,7 @@
  */
 package org.dllearner.algorithms.isle;
 
+import java.util.HashMap;
 import java.util.Set;
 
 import org.dllearner.algorithms.isle.index.Annotation;
@@ -22,4 +23,7 @@
 	}
 
 	public abstract Set<Entity> getCandidates(Annotation annotation);
+	
+
+	public abstract HashMap<Annotation,Set<Entity>> getCandidatesMap(Set<Annotation> annotations);
 }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java	2013-09-10 15:49:18 UTC (rev 4106)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java	2013-09-10 15:52:48 UTC (rev 4107)
@@ -1,5 +1,6 @@
 package org.dllearner.algorithms.isle.index;
 
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -40,8 +41,9 @@
     public AnnotatedDocument processDocument(TextDocument document){
     	Set<Annotation> annotations = linguisticAnnotator.annotate(document);
     	Set<SemanticAnnotation> semanticAnnotations = new HashSet<SemanticAnnotation>();
-    	for (Annotation annotation : annotations) {
-    		Set<Entity> candidateEntities = entityCandidateGenerator.getCandidates(annotation);
+    	HashMap<Annotation,Set<Entity>> candidatesMap = entityCandidateGenerator.getCandidatesMap(annotations);
+    	for (Annotation annotation : candidatesMap.keySet()) {
+    		Set<Entity> candidateEntities = candidatesMap.get(annotation);
             if (candidateEntities == null || candidateEntities.size() == 0) {
                 continue;
             }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java	2013-09-10 15:49:18 UTC (rev 4106)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java	2013-09-10 15:52:48 UTC (rev 4107)
@@ -3,6 +3,7 @@
  */
 package org.dllearner.algorithms.isle.index;
 
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -39,4 +40,13 @@
 		return allEntities;
 	}
 
+	@Override
+	public HashMap<Annotation, Set<Entity>> getCandidatesMap(Set<Annotation> annotations) {
+		HashMap<Annotation, Set<Entity>> result = new HashMap<Annotation, Set<Entity>>();
+		for (Annotation annotation: annotations) 
+			result.put(annotation, getCandidates(annotation));
+		
+		return result;
+	}
+
 }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java	2013-09-10 15:49:18 UTC (rev 4106)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java	2013-09-10 15:52:48 UTC (rev 4107)
@@ -1,11 +1,24 @@
 package org.dllearner.algorithms.isle.index;
 
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import org.dllearner.algorithms.isle.EntityCandidateGenerator;
+import org.dllearner.algorithms.isle.StopWordFilter;
 import org.dllearner.core.owl.Entity;
 import org.semanticweb.owlapi.model.OWLOntology;
 
+import cern.colt.Arrays;
+import cern.colt.list.AbstractCollection;
+
+import edu.stanford.nlp.util.Sets;
+
 /**
  * Generates candidates using a entity candidates prefix trie
  * @author Andre Melo
@@ -13,7 +26,9 @@
  */
 public class TrieEntityCandidateGenerator extends EntityCandidateGenerator{
 
-	EntityCandidatesTrie candidatesTrie;
+	final EntityCandidatesTrie candidatesTrie;
+	final StopWordFilter stopWordFilter = new StopWordFilter();
+	int window = 10;
 	
 	public TrieEntityCandidateGenerator(OWLOntology ontology, EntityCandidatesTrie candidatesTrie) {
 		super(ontology);
@@ -24,4 +39,103 @@
 		return candidatesTrie.getCandidateEntities(annotation.getToken());
 	}
 
+    /**
+     * Postprocess the annotations generated by annotate
+     * The objective is to merge annotations which are likely to belong to the same entity
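+     * @param candidatesMap : map from each annotation to its candidate entities; modified in place
+     * @param window : maximum distance between the annotations
+     * @param stopWordFilter : filter used to recognise stop-word tokens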
+     */
+    public void postProcess(HashMap<Annotation,Set<Entity>> candidatesMap, int window, StopWordFilter stopWordFilter) {
+    	Set<Annotation> annotations = candidatesMap.keySet();
+    	List<Annotation> sortedAnnotations = new ArrayList<Annotation>(annotations);
+    	
+    	// Sort annotations by offset in ascending order
+    	Collections.sort(sortedAnnotations, new Comparator<Annotation>(){
+            public int compare(Annotation a1,Annotation a2){
+                return Integer.compare(a1.getOffset(), a2.getOffset());
+            }
+    	});
+    	
+    	int windowStart = 0;
+    	int windowEnd = 0;
+    	for (int i=0; i<sortedAnnotations.size(); i++) {
+    		
+    		Annotation annotation_i = sortedAnnotations.get(i);
+    		int begin_i = annotation_i.getOffset();
+    		int end_i = begin_i + annotation_i.getLength()-1;
+    		String token_i = annotation_i.getToken();
+    		Set<Entity> candidates_i = getCandidates(annotation_i);
+    		Set<Entity> newCandidates_i = new HashSet<Entity>();
+    		
+    		// Determine the annotations contained in the window
+    		while ((sortedAnnotations.get(windowStart).getOffset()+sortedAnnotations.get(windowStart).getLength()-1)<(begin_i-window))
+    			windowStart++;
+    		while (windowEnd<sortedAnnotations.size() && sortedAnnotations.get(windowEnd).getOffset()<(end_i+window))
+    			windowEnd++;
+    		
+    		// For every annotation in the window (defined by the number of characters between offsets)
+    		for (int j=windowStart; j<sortedAnnotations.size() && j<windowEnd; j++) {
+    			if (j!=i) {
+	    			Annotation annotation_j = sortedAnnotations.get(j);
+	    			String token_j = annotation_j.getToken();
+	    			Set<Entity> candidates_j = getCandidates(annotation_j);
+	    			Set<Entity> intersection = Sets.intersection(candidates_i, candidates_j);
+	    			Set<Entity> newCandidates_ij = new HashSet<Entity>();
+	    			for (Entity commonEntity: intersection) {
+	    				if (!(stopWordFilter.isStopWord(token_i) && stopWordFilter.isStopWord(token_j))) {
+		    				if (!token_i.contains(token_j) && !token_j.contains(token_i)) {
+		    					newCandidates_ij.add(commonEntity);
+		    					//System.out.println("common("+token_i+","+token_j+")="+commonEntity);
+		    				}
+	    				}
+	    			}
+	    			if (!newCandidates_ij.isEmpty()) {
+	    				Annotation mergedAnnotation = mergeAnnotations(annotation_i,annotation_j);
+	    				// If there's no punctuation in the merged annotation
+	    				if (!Pattern.matches("\\p{Punct}", mergedAnnotation.getToken())) {
+		    				candidatesMap.put(mergedAnnotation, newCandidates_ij);
+		    				candidatesMap.remove(annotation_i);
+		    				candidatesMap.remove(annotation_j);
+	    				}
+	    				
+	    				newCandidates_i.addAll(newCandidates_ij);
+	    			}
+    			}
+    		}
+    		
+    		// Deletes annotation if it's a stop word and doesn't have any matching annotation in the window
+    		if (stopWordFilter.isStopWord(token_i)) {
+    			if (newCandidates_i.isEmpty())
+    				candidatesMap.remove(annotation_i);	
+    		}
+    	}
+    	
+    	
+    	
+    }
+
+	private Annotation mergeAnnotations(Annotation annotation_i, Annotation annotation_j) {
+		int offset;
+		int length;
+		if (annotation_i.getOffset() < annotation_j.getOffset()) {
+			offset = annotation_i.getOffset();
+			length = annotation_j.getOffset() - offset + annotation_j.getLength(); 
+		} else {
+			offset = annotation_j.getOffset();
+			length = annotation_i.getOffset() - offset + annotation_i.getLength();
+		}
+		return new Annotation(annotation_i.getReferencedDocument(), offset, length);
+	}
+
+	@Override
+	public HashMap<Annotation, Set<Entity>> getCandidatesMap(Set<Annotation> annotations) {
+		HashMap<Annotation, Set<Entity>> candidatesMap = new HashMap<Annotation, Set<Entity>>();
+		for (Annotation annotation: annotations) 
+			candidatesMap.put(annotation, getCandidates(annotation));
+		
+		postProcess(candidatesMap, window, stopWordFilter);
+		
+		return candidatesMap;
+	}
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.





[DL-Learner SVN] SF.net SVN: dl-learner:[4107] trunk/components-core/src/main/java/org/dllearner/algorithms/isle

[DL-Learner SVN] SF.net SVN: dl-learner:[4107] trunk/components-core/src/main/java/org/dllearner/algorithms/isle