From: <and...@us...> - 2013-09-10 15:52:52
|
Revision: 4107 http://sourceforge.net/p/dl-learner/code/4107 Author: andremelo Date: 2013-09-10 15:52:48 +0000 (Tue, 10 Sep 2013) Log Message: ----------- - Adding the method to EntityCandidateGenerator interface: HashMap<Annotation,Set<Entity>> getCandidatesMap(Set<Annotation> annotations) - Adding first version of the postprocessing from the trie implementation Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java 2013-09-10 15:49:18 UTC (rev 4106) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java 2013-09-10 15:52:48 UTC (rev 4107) @@ -3,6 +3,7 @@ */ package org.dllearner.algorithms.isle; +import java.util.HashMap; import java.util.Set; import org.dllearner.algorithms.isle.index.Annotation; @@ -22,4 +23,7 @@ } public abstract Set<Entity> getCandidates(Annotation annotation); + + + public abstract HashMap<Annotation,Set<Entity>> getCandidatesMap(Set<Annotation> annotations); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-10 15:49:18 UTC (rev 4106) +++ 
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-10 15:52:48 UTC (rev 4107) @@ -1,5 +1,6 @@ package org.dllearner.algorithms.isle.index; +import java.util.HashMap; import java.util.HashSet; import java.util.Set; @@ -40,8 +41,9 @@ public AnnotatedDocument processDocument(TextDocument document){ Set<Annotation> annotations = linguisticAnnotator.annotate(document); Set<SemanticAnnotation> semanticAnnotations = new HashSet<SemanticAnnotation>(); - for (Annotation annotation : annotations) { - Set<Entity> candidateEntities = entityCandidateGenerator.getCandidates(annotation); + HashMap<Annotation,Set<Entity>> candidatesMap = entityCandidateGenerator.getCandidatesMap(annotations); + for (Annotation annotation : candidatesMap.keySet()) { + Set<Entity> candidateEntities = candidatesMap.get(annotation); if (candidateEntities == null || candidateEntities.size() == 0) { continue; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java 2013-09-10 15:49:18 UTC (rev 4106) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java 2013-09-10 15:52:48 UTC (rev 4107) @@ -3,6 +3,7 @@ */ package org.dllearner.algorithms.isle.index; +import java.util.HashMap; import java.util.HashSet; import java.util.Set; @@ -39,4 +40,13 @@ return allEntities; } + @Override + public HashMap<Annotation, Set<Entity>> getCandidatesMap(Set<Annotation> annotations) { + HashMap<Annotation, Set<Entity>> result = new HashMap<Annotation, Set<Entity>>(); + for (Annotation annotation: annotations) + result.put(annotation, getCandidates(annotation)); + + return result; + } + } Modified: 
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-09-10 15:49:18 UTC (rev 4106) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-09-10 15:52:48 UTC (rev 4107) @@ -1,11 +1,24 @@ package org.dllearner.algorithms.isle.index; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Set; +import java.util.regex.Pattern; import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.StopWordFilter; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; +import cern.colt.Arrays; +import cern.colt.list.AbstractCollection; + +import edu.stanford.nlp.util.Sets; + /** * Generates candidates using a entity candidates prefix trie * @author Andre Melo @@ -13,7 +26,9 @@ */ public class TrieEntityCandidateGenerator extends EntityCandidateGenerator{ - EntityCandidatesTrie candidatesTrie; + final EntityCandidatesTrie candidatesTrie; + final StopWordFilter stopWordFilter = new StopWordFilter(); + int window = 10; public TrieEntityCandidateGenerator(OWLOntology ontology, EntityCandidatesTrie candidatesTrie) { super(ontology); @@ -24,4 +39,103 @@ return candidatesTrie.getCandidateEntities(annotation.getToken()); } + /** + * Postprocess the annotations generated by annotate + * The objective is to merge annotations which are likely to belong to the same entity + * @param annotations : set of annotations + * @param window : maximum distance between the annotations + * @return + */ + public void postProcess(HashMap<Annotation,Set<Entity>> candidatesMap, int window, StopWordFilter stopWordFilter) 
{ + Set<Annotation> annotations = candidatesMap.keySet(); + List<Annotation> sortedAnnotations = new ArrayList<Annotation>(annotations); + + // Sort annotations by offset in ascending order + Collections.sort(sortedAnnotations, new Comparator<Annotation>(){ + public int compare(Annotation a1,Annotation a2){ + return Integer.compare(a1.getOffset(), a2.getOffset()); + } + }); + + int windowStart = 0; + int windowEnd = 0; + for (int i=0; i<sortedAnnotations.size(); i++) { + + Annotation annotation_i = sortedAnnotations.get(i); + int begin_i = annotation_i.getOffset(); + int end_i = begin_i + annotation_i.getLength()-1; + String token_i = annotation_i.getToken(); + Set<Entity> candidates_i = getCandidates(annotation_i); + Set<Entity> newCandidates_i = new HashSet<Entity>(); + + // Determine the annotations contained in the window + while ((sortedAnnotations.get(windowStart).getOffset()+sortedAnnotations.get(windowStart).getLength()-1)<(begin_i-window)) + windowStart++; + while (windowEnd<sortedAnnotations.size() && sortedAnnotations.get(windowEnd).getOffset()<(end_i+window)) + windowEnd++; + + // For every annotation in the window (defined by the number of characters between offsets) + for (int j=windowStart; j<sortedAnnotations.size() && j<windowEnd; j++) { + if (j!=i) { + Annotation annotation_j = sortedAnnotations.get(j); + String token_j = annotation_j.getToken(); + Set<Entity> candidates_j = getCandidates(annotation_j); + Set<Entity> intersection = Sets.intersection(candidates_i, candidates_j); + Set<Entity> newCandidates_ij = new HashSet<Entity>(); + for (Entity commonEntity: intersection) { + if (!(stopWordFilter.isStopWord(token_i) && stopWordFilter.isStopWord(token_j))) { + if (!token_i.contains(token_j) && !token_j.contains(token_i)) { + newCandidates_ij.add(commonEntity); + //System.out.println("common("+token_i+","+token_j+")="+commonEntity); + } + } + } + if (!newCandidates_ij.isEmpty()) { + Annotation mergedAnnotation = 
mergeAnnotations(annotation_i,annotation_j); + // If there's no punctuation in the merged annotation + if (!Pattern.matches("\\p{Punct}", mergedAnnotation.getToken())) { + candidatesMap.put(mergedAnnotation, newCandidates_ij); + candidatesMap.remove(annotation_i); + candidatesMap.remove(annotation_j); + } + + newCandidates_i.addAll(newCandidates_ij); + } + } + } + + // Deletes annotation if it's a stop word and doesn't have any matching annotation in the window + if (stopWordFilter.isStopWord(token_i)) { + if (newCandidates_i.isEmpty()) + candidatesMap.remove(annotation_i); + } + } + + + + } + + private Annotation mergeAnnotations(Annotation annotation_i, Annotation annotation_j) { + int offset; + int length; + if (annotation_i.getOffset() < annotation_j.getOffset()) { + offset = annotation_i.getOffset(); + length = annotation_j.getOffset() - offset + annotation_j.getLength(); + } else { + offset = annotation_j.getOffset(); + length = annotation_i.getOffset() - offset + annotation_i.getLength(); + } + return new Annotation(annotation_i.getReferencedDocument(), offset, length); + } + + @Override + public HashMap<Annotation, Set<Entity>> getCandidatesMap(Set<Annotation> annotations) { + HashMap<Annotation, Set<Entity>> candidatesMap = new HashMap<Annotation, Set<Entity>>(); + for (Annotation annotation: annotations) + candidatesMap.put(annotation, getCandidates(annotation)); + + postProcess(candidatesMap, window, stopWordFilter); + + return candidatesMap; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |