From: <lor...@us...> - 2013-12-09 14:22:22
|
Revision: 4193 http://sourceforge.net/p/dl-learner/code/4193 Author: lorenz_b Date: 2013-12-09 14:22:20 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Added generator class for semantic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -14,6 +14,9 @@ import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.trees.CollinsHeadFinder; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; import edu.stanford.nlp.util.CoreMap; public class TextDocumentGenerator { @@ -26,7 +29,7 @@ private TextDocumentGenerator(){ Properties props = new Properties(); - props.put("annotators", "tokenize, ssplit, pos, lemma"); + props.put("annotators", "tokenize, ssplit, pos, lemma, parse"); pipeline = new StanfordCoreNLP(props); } @@ -58,12 +61,21 @@ //this is the POS tag of the token String lemma = label.get(LemmaAnnotation.class); //check if token is punctuation - boolean isPunctuation = 
word.matches(punctuationPattern); + boolean isPunctuation = word.matches(punctuationPattern) + || pos.equalsIgnoreCase("-lrb-") + || pos.equalsIgnoreCase("-rrb-") + || word.startsWith("'") + ; //check if it is a stop word - boolean isStopWord = stopWordFilter.isStopWord(word); + boolean isStopWord = stopWordFilter.isStopWord(word.toLowerCase()); Token token = new Token(word, lemma, pos, isPunctuation, isStopWord); - + + //determine the head noun + Tree tree = sentence.get(TreeAnnotation.class); + CollinsHeadFinder headFinder = new CollinsHeadFinder(); + Tree head = headFinder.determineHead(tree); + document.add(token); } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -4,6 +4,8 @@ import java.util.LinkedList; import java.util.List; +import org.dllearner.algorithms.isle.TextDocumentGenerator; + /** * A simple text document without further formatting or markup. 
* @@ -11,13 +13,10 @@ */ public class TextDocument extends LinkedList<Token> implements Document { public static void main(String[] args) { - TextDocument t = new TextDocument(); String s = "This is a very long, nice text for testing our new implementation of TextDocument."; - for (String e : s.split(" ")) { - t.add(new Token(e)); - } + TextDocument doc = TextDocumentGenerator.getInstance().generateDocument(s); - System.out.println(t.getRawContent()); + System.out.println(doc.getRawContent()); } @Override Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -12,6 +12,8 @@ public class TrieLinguisticAnnotator implements LinguisticAnnotator { EntityCandidatesTrie candidatesTrie; private boolean normalizeWords = true; + + private boolean ignoreStopWords = true; public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) { this.candidatesTrie = candidatesTrie; @@ -30,11 +32,13 @@ List<Token> matchedTokens; for (Token token : document) { - matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true)); - if(matchedTokens != null && !matchedTokens.isEmpty()){ - Annotation annotation = new Annotation(document, matchedTokens); - annotations.add(annotation); - } + if(!(token.isPunctuation() ||token.isStopWord())){ + matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true)); + if(matchedTokens != null && !matchedTokens.isEmpty()){ + Annotation annotation = new Annotation(document, matchedTokens); + annotations.add(annotation); + } + } } return annotations; } Added: 
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -0,0 +1,163 @@ +package org.dllearner.algorithms.isle.index.semantic; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.HashSet; +import java.util.Set; + +import org.apache.log4j.Logger; +import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.LinguisticAnnotator; +import org.dllearner.algorithms.isle.index.SemanticAnnotator; +import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie; +import org.dllearner.algorithms.isle.index.TextDocument; +import org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator; +import org.dllearner.algorithms.isle.index.TrieLinguisticAnnotator; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; +import org.dllearner.algorithms.isle.wsd.StructureBasedWordSenseDisambiguation; +import org.dllearner.algorithms.isle.wsd.WindowBasedContextExtractor; +import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLAnnotation; +import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLLiteral; +import org.semanticweb.owlapi.model.OWLOntology; + +import 
com.google.common.hash.HashCode; +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; + +/** + * Generator for a semantic index, i.e., an index which is able to resolve a given entity's URI to the set of documents containing + * this entity, i.e., documents which contain words disambiguated to the given entity. + * + * @author Lorenz Buehmann + * @author Daniel Fleischhacker + */ +public abstract class SemanticIndexGenerator { + + static HashFunction hf = Hashing.md5(); + private static final Logger logger = Logger.getLogger(SemanticIndexGenerator.class.getName()); + private static boolean useCache = false; + + public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, WordSenseDisambiguation wordSenseDisambiguation, + EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator){ + SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wordSenseDisambiguation, entityCandidateGenerator, linguisticAnnotator); + return generateIndex(documents, ontology, semanticAnnotator); + } + + public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, SemanticAnnotator semanticAnnotator){ + SemanticIndex semanticIndex; + //try to load serialized version + HashCode hc = hf.newHasher().putInt(documents.hashCode()).putInt(ontology.hashCode()).hash(); + File file = new File(hc.toString() + ".ser"); + if(useCache && file.exists()){ + try { + logger.info("Loading semantic index from disk..."); + ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file)); + semanticIndex = (SemanticIndex) ois.readObject(); + ois.close(); + logger.info("...done."); + } catch (Exception e) { + e.printStackTrace(); + semanticIndex = buildIndex(semanticAnnotator, documents); + } + } else { + logger.info("Building semantic index..."); + semanticIndex = buildIndex(semanticAnnotator, documents); + try { + ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(file)); + 
oos.writeObject(semanticIndex); + oos.close(); + } catch (IOException e1) { + e1.printStackTrace(); + } + logger.info("...done."); + } + return semanticIndex; + } + + public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, boolean useWordNormalization){ + SimpleEntityCandidatesTrie trie; + if (useWordNormalization) { + trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), + ontology, new SimpleEntityCandidatesTrie.LemmatizingWordNetNameGenerator(5)); + } + else { + trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), + ontology, new SimpleEntityCandidatesTrie.DummyNameGenerator()); + } + trie.printTrie(); + + TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(trie); + linguisticAnnotator.setNormalizeWords(useWordNormalization); + + SemanticAnnotator semanticAnnotator = new SemanticAnnotator( + new StructureBasedWordSenseDisambiguation(new WindowBasedContextExtractor(), ontology), + new TrieEntityCandidateGenerator(ontology, trie), + linguisticAnnotator); + return generateIndex(documents, ontology, semanticAnnotator); + } + + public static SemanticIndex generateIndex(OWLOntology ontology, OWLAnnotationProperty annotationProperty, String language, boolean useWordNormalization){ + Set<OWLEntity> schemaEntities = new HashSet<OWLEntity>(); + schemaEntities.addAll(ontology.getClassesInSignature()); + schemaEntities.addAll(ontology.getObjectPropertiesInSignature()); + schemaEntities.addAll(ontology.getDataPropertiesInSignature()); + Set<String> documents = new HashSet<String>(); + for (OWLEntity entity : schemaEntities) { + String label = null; + Set<OWLAnnotation> annotations = entity.getAnnotations(ontology, annotationProperty); + for (OWLAnnotation annotation : annotations) { + if (annotation.getValue() instanceof OWLLiteral) { + OWLLiteral val = (OWLLiteral) annotation.getValue(); + if (language != null) { + if (val.hasLang(language)) { + label = 
val.getLiteral(); + } + } + else { + label = val.getLiteral(); + } + } + } + if (label != null) { + documents.add(label); + } + } + return generateIndex(documents, ontology, useWordNormalization); + } + + /** + * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents. + */ + private static SemanticIndex buildIndex(SemanticAnnotator semanticAnnotator, Set<String> documents) { + logger.info("Creating semantic index..."); + SemanticIndex index = new SemanticIndex(); + for (String document : documents) { + TextDocument textDocument = TextDocumentGenerator.getInstance().generateDocument(document); + logger.debug("Processing document:" + textDocument); + AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(textDocument); + for (Entity entity : annotatedDocument.getContainedEntities()) { + Set<AnnotatedDocument> existingAnnotatedDocuments = index.get(entity); + if (existingAnnotatedDocuments == null) { + existingAnnotatedDocuments = new HashSet<AnnotatedDocument>(); + index.put(entity, existingAnnotatedDocuments); + } + existingAnnotatedDocuments.add(annotatedDocument); + } + logger.debug("Annotated document:" + annotatedDocument); + } + int size = documents.size(); + index.setTotalNrOfDocuments(size); + logger.info("...done."); + return index; + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -90,10 +90,13 @@ if (annotation.getValue() instanceof OWLLiteral) { OWLLiteral val = (OWLLiteral) annotation.getValue(); if (val.hasLang(language)) { + //trim String label 
= val.getLiteral().trim(); if(entity instanceof NamedClass){ label = label.toLowerCase(); } + //remove content in brackets like (...) + label = label.replaceAll("\\s?\\((.*?)\\)", ""); textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label), weight); } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |