From: <lor...@us...> - 2013-10-15 12:06:49
|
Revision: 4122 http://sourceforge.net/p/dl-learner/code/4122 Author: lorenz_b Date: 2013-10-15 12:06:45 +0000 (Tue, 15 Oct 2013) Log Message: ----------- Added context extractor based on token sentence. Almost finished WSD. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/ContextExtractor.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/ContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/ContextExtractor.java 2013-10-11 21:29:34 UTC (rev 4121) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/ContextExtractor.java 2013-10-15 12:06:45 UTC (rev 4122) @@ -3,13 +3,16 @@ */ package org.dllearner.algorithms.isle.wsd; -import java.util.Set; +import java.util.List; +import org.dllearner.algorithms.isle.index.Annotation; + + /** * @author Lorenz Buehmann * */ public interface ContextExtractor { - Set<String> extractContext(String token, String document); + List<String> extractContext(Annotation annotation); } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-10-15 12:06:45 UTC (rev 4122) @@ -0,0 +1,89 @@ +/** + * + */ +package org.dllearner.algorithms.isle.wsd; + +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.dllearner.algorithms.isle.index.TextDocument; + +import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.util.CoreMap; + +/** + * @author Lorenz Buehmann + * + */ +public class SentenceBasedContextExtractor implements ContextExtractor{ + + private StanfordCoreNLP pipeline; + + public SentenceBasedContextExtractor() { + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit"); + pipeline = new StanfordCoreNLP(props); + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java.lang.String, java.lang.String) + */ + @Override + public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) { + //split text into sentences + List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent()); + + //find the sentence containing the token of the annotation + int tokenStart = annotation.getOffset(); + int index = 0; + for (CoreMap sentence : sentences) { + String s = sentence.toString(); + if (index < tokenStart && s.length() > tokenStart) { + List<String> context = new ArrayList<String>(); + for (CoreLabel label : sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = label.get(TextAnnotation.class); + + if(!word.isEmpty() && !word.matches("\\p{Punct}")){ + context.add(word); + } + } + return context; + } + index += s.length(); + } + throw new RuntimeException("Token " + annotation.getToken() + " not found in text " + annotation.getReferencedDocument().getRawContent()); + } + + private List<CoreMap> getSentences(String document) { + // create an empty Annotation just with the given text + Annotation annotation = new Annotation(document); + + // run all Annotators on this text + pipeline.annotate(annotation); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and + // has values with custom types + List<CoreMap> sentences = annotation.get(SentencesAnnotation.class); + + return sentences; + } + + public static void main(String[] args) throws Exception { + String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software," + + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology."; + + String token = "services"; + SentenceBasedContextExtractor extractor = new SentenceBasedContextExtractor(); + List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(new TextDocument(s), s.indexOf(token), token.length())); + System.out.println(context); + } + +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java 2013-10-11 21:29:34 UTC (rev 4121) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java 2013-10-15 12:06:45 UTC (rev 4122) @@ -3,15 +3,22 @@ */ package org.dllearner.algorithms.isle.wsd; +import java.io.IOException; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; import java.util.Set; import org.dllearner.algorithms.isle.StructuralEntityContext; +import org.dllearner.algorithms.isle.VSMCosineDocumentSimilarity; import org.dllearner.algorithms.isle.index.Annotation; import org.dllearner.algorithms.isle.index.SemanticAnnotation; import org.dllearner.core.owl.Entity; -import org.semanticweb.owlapi.model.OWLEntity; import org.semanticweb.owlapi.model.OWLOntology; +import com.google.common.base.Joiner; +import com.google.common.collect.Sets; + /** * @author Lorenz Buehmann * @@ -33,13 +40,52 @@ */ @Override public SemanticAnnotation disambiguate(Annotation annotation, Set<Entity> candidateEntities) { - //get the context of the annotated token - Set<String> tokenContext = contextExtractor.extractContext(annotation.getToken(), annotation.getReferencedDocument().getContent()); - //compare this context with the context of each entity candidate - for (Entity entity : candidateEntities) { - Set<String> entityContext = StructuralEntityContext.getContextInNaturalLanguage(ontology, entity); + if(!candidateEntities.isEmpty()){ + //get the context of the annotated token + List<String> tokenContext = contextExtractor.extractContext(annotation); + //compare this context with the context of each entity candidate + double maxScore = Double.MIN_VALUE; + Entity bestEntity = null; + for (Entity entity : candidateEntities) { + //get the context of the entity by analyzing the structure of the ontology + Set<String> entityContext = StructuralEntityContext.getContextInNaturalLanguage(ontology, entity); + //compute the VSM Cosine Similarity + double score = computeScore(tokenContext, entityContext); + //set best entity + if(score > maxScore){ + maxScore = score; + bestEntity = entity; + } + } + + return new SemanticAnnotation(annotation, bestEntity); } return null; } + + /** + * Compute the overlap between 2 set of words + * @param words1 + * @param words2 + * @return + */ + private double computeScoreSimple(Collection<String> words1, Collection<String> words2){ + return Sets.intersection(new HashSet<String>(words1), new HashSet<String>(words2)).size(); + } + + /** + * Compute the Cosine Similarity using as VSM. + * @param words1 + * @param words2 + */ + private double computeScore(Collection<String> words1, Collection<String> words2){ + double score = 0d; + try { + score = VSMCosineDocumentSimilarity.getCosineSimilarity(Joiner.on(" ").join(words1), Joiner.on(" ").join(words2)); + } catch (IOException e) { + e.printStackTrace(); + } + return score; + } } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-10-15 12:06:45 UTC (rev 4122) @@ -0,0 +1,95 @@ +/** + * + */ +package org.dllearner.algorithms.isle.wsd; + +import java.util.ArrayList; +import java.util.List; +import java.util.Properties; + +import org.dllearner.algorithms.isle.index.TextDocument; + +import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.util.CoreMap; + +/** + * @author Lorenz Buehmann + * + */ +public class WindowBasedContextExtractor implements ContextExtractor{ + + private StanfordCoreNLP pipeline; + + /** + * + */ + public WindowBasedContextExtractor() { + + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit"); + pipeline = new StanfordCoreNLP(props); + + + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java.lang.String, java.lang.String) + */ + @Override + public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) { + // split text into sentences + List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent()); + + // find the sentence containing the token of the annotation + int tokenStart = annotation.getOffset(); + int index = 0; + for (CoreMap sentence : sentences) { + String s = sentence.toString(); + if (index < tokenStart && s.length() > tokenStart) { + List<String> context = new ArrayList<String>(); + for (CoreLabel label : sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = label.get(TextAnnotation.class); + + context.add(word); + } + return context; + } + index += s.length(); + } + throw new RuntimeException("Token " + annotation.getToken() + " not found in text " + + annotation.getReferencedDocument().getRawContent()); + + } + + private List<CoreMap> getSentences(String document) { + // create an empty Annotation just with the given text + Annotation annotation = new Annotation(document); + + // run all Annotators on this text + pipeline.annotate(annotation); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and + // has values with custom types + List<CoreMap> sentences = annotation.get(SentencesAnnotation.class); + + return sentences; + } + + public static void main(String[] args) throws Exception { + String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software," + + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology."; + + String token = "services"; + WindowBasedContextExtractor extractor = new WindowBasedContextExtractor(); + List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(new TextDocument(s), s.indexOf(token), token.length())); + System.out.println(context); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |