From: <lor...@us...> - 2013-12-09 15:34:19
|
Revision: 4196 http://sourceforge.net/p/dl-learner/code/4196 Author: lorenz_b Date: 2013-12-09 15:34:15 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Added syntactic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -17,6 +17,8 @@ import edu.stanford.nlp.trees.CollinsHeadFinder; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import edu.stanford.nlp.trees.tregex.TregexPattern; import edu.stanford.nlp.util.CoreMap; public class TextDocumentGenerator { @@ 
-41,6 +43,10 @@ } public TextDocument generateDocument(String text) { + return generateDocument(text, false); + } + + public TextDocument generateDocument(String text, boolean determineHead) { TextDocument document = new TextDocument(); // create an empty Annotation just with the given text Annotation annotatedDocument = new Annotation(text); @@ -53,6 +59,33 @@ List<CoreMap> sentences = annotatedDocument.get(SentencesAnnotation.class); for(CoreMap sentence: sentences) { + + //determine the head noun + String head = null; + if(determineHead){ + //if phrase only contains one single token, the task is trivial + if(sentence.get(TokensAnnotation.class).size() == 1){ + head = sentence.get(TokensAnnotation.class).get(0).get(TextAnnotation.class); + } else { + Tree tree = sentence.get(TreeAnnotation.class); + CollinsHeadFinder headFinder = new CollinsHeadFinder(); +// Tree head = headFinder.determineHead(tree); +// System.out.println(sentence); +// System.out.println(tree.headTerminal(headFinder)); + head = tree.headTerminal(headFinder).toString(); + + // Create a reusable pattern object + TregexPattern patternMW = TregexPattern.compile("__ >># NP"); + // Run the pattern on one particular tree + TregexMatcher matcher = patternMW.matcher(tree); + // Iterate over all of the subtrees that matched + while (matcher.findNextMatchingNode()) { + Tree match = matcher.getMatch(); + // do what we want to with the subtree + } + } + } + for (CoreLabel label: sentence.get(TokensAnnotation.class)) { // this is the text of the token String word = label.get(TextAnnotation.class); @@ -71,10 +104,9 @@ Token token = new Token(word, lemma, pos, isPunctuation, isStopWord); - //determine the head noun - Tree tree = sentence.get(TreeAnnotation.class); - CollinsHeadFinder headFinder = new CollinsHeadFinder(); - Tree head = headFinder.determineHead(tree); + if(determineHead && word.equals(head)){ + token.setIsHead(true); + } document.add(token); } Added: 
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -0,0 +1,31 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.util.Set; + +import org.dllearner.core.owl.Entity; + +/** + * @author Lorenz Buehmann + * + */ +public interface Index { + + /** + * Returns a set of documents based on how the underlying index is processing the given + * entity. + * + * @param entity entity specifying the documents to retrieve + * @return set of documents retrieved based on the given entity + */ + Set<AnnotatedDocument> getDocuments(Entity entity); + + /** + * Returns the total number of documents contained in the index. + * + * @return the total number of documents contained in the index + */ + int getTotalNumberOfDocuments(); +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -21,6 +21,7 @@ private String posTag; private boolean isPunctuation; private boolean isStopWord; + private boolean isHead; /// for storing alternative forms of this token, e.g., generated by WordNet synonyms private HashSet<String> alternativeForms; @@ -36,7 +37,7 @@ this.isStopWord = isStopWord; this.alternativeForms = new HashSet<>(); } - + /** * @return the rawForm */ @@ -117,6 +118,20 @@ public void setIsStopWord(boolean isStopWord) { this.isStopWord = isStopWord; } + + /** + * @param isHead whether the
token is the head of the containing sequence of tokens + */ + public void setIsHead(boolean isHead) { + this.isHead = isHead; + } + + /** + * @return the isHead + */ + public boolean isHead() { + return isHead; + } /* (non-Javadoc) * @see java.lang.Object#toString() Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -3,6 +3,15 @@ */ package org.dllearner.algorithms.isle.index.syntactic; +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; @@ -12,71 +21,88 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TotalHitCountCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.AnnotatedTextDocument; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.algorithms.isle.index.TextDocument; +import org.dllearner.algorithms.isle.index.Token; +import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever; +import
org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; -import java.io.File; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - /** * @author Lorenz Buehmann * */ -public class LuceneSyntacticIndex implements SyntacticIndex { +public class LuceneSyntacticIndex implements Index { private IndexSearcher searcher; private QueryParser parser; private IndexReader indexReader; private String searchField; + + AnnotationEntityTextRetriever textRetriever; - public LuceneSyntacticIndex(IndexReader indexReader, String searchField) throws Exception { + public LuceneSyntacticIndex(OWLOntology ontology, IndexReader indexReader, String searchField) throws Exception { this.indexReader = indexReader; this.searchField = searchField; searcher = new IndexSearcher(indexReader); StandardAnalyzer analyzer = new StandardAnalyzer( Version.LUCENE_43); parser = new QueryParser( Version.LUCENE_43, searchField, analyzer ); + + textRetriever = new RDFSLabelEntityTextRetriever(ontology); } - public LuceneSyntacticIndex(Directory directory, String searchField) throws Exception { - this(DirectoryReader.open(directory), searchField); + public LuceneSyntacticIndex(OWLOntology ontology, Directory directory, String searchField) throws Exception { + this(ontology, DirectoryReader.open(directory), searchField); } - public LuceneSyntacticIndex(String indexDirectory, String searchField) throws Exception { - this(DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), searchField); + public LuceneSyntacticIndex(OWLOntology ontology, String indexDirectory, String searchField) throws Exception { + this(ontology, DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), searchField); } /* (non-Javadoc) * @see org.dllearner.algorithms.isle.SyntacticIndex#getDocuments(java.lang.String) */ @Override - public Set<org.dllearner.algorithms.isle.index.Document> 
getDocuments(String searchString) { - Set<org.dllearner.algorithms.isle.index.Document> documents = new HashSet<org.dllearner.algorithms.isle.index.Document>(); - try { - Query query = parser.parse(searchString); - ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs; - for (int i = 0; i < result.length; i++) { - Document doc = searcher.doc(result[i].doc); - documents.add(TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField))); + public Set<AnnotatedDocument> getDocuments(Entity entity) { + Set<AnnotatedDocument> documents = new HashSet<AnnotatedDocument>(); + + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + + for (Entry<List<Token>, Double> entry : relevantText.entrySet()) { + List<Token> tokens = entry.getKey(); + for (Token token : tokens) { + try { + Query query = parser.parse(token.getRawForm()); + ScoreDoc[] result = searcher.search(query, getTotalNumberOfDocuments()).scoreDocs; + for (int i = 0; i < result.length; i++) { + Document doc = searcher.doc(result[i].doc); + documents.add(new AnnotatedTextDocument( + TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField)), + Collections.EMPTY_SET)); + } + } catch (ParseException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } } - } catch (ParseException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); } + return documents; } /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SyntacticIndex#getSize() + * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments() */ @Override - public int getSize() { + public int getTotalNumberOfDocuments() { return indexReader.numDocs(); } @@ -94,22 +120,5 @@ return documents; } - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SyntacticIndex#count(java.lang.String) - */ - @Override - public int count(String searchString) { - try { - Query query = parser.parse(searchString); - TotalHitCountCollector results 
= new TotalHitCountCollector(); - searcher.search(query, results); - return results.getTotalHits(); - } catch (ParseException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - return -1; - } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -3,6 +3,10 @@ */ package org.dllearner.algorithms.isle.index.syntactic; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; @@ -14,14 +18,17 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; -import org.semanticweb.owlapi.model.*; +import org.dllearner.algorithms.isle.index.Index; +import org.semanticweb.owlapi.model.OWLAnnotation; +import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLDataFactory; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLLiteral; +import org.semanticweb.owlapi.model.OWLOntology; import org.semanticweb.owlapi.vocab.OWLRDFVocabulary; + import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - /** * Creates a Lucene Index for the labels if classes and properties. 
* @author Lorenz Buehmann @@ -49,7 +56,7 @@ schemaEntities.addAll(ontology.getDataPropertiesInSignature()); } - public SyntacticIndex buildIndex() throws Exception{ + public Index buildIndex() throws Exception{ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(directory, indexWriterConfig); Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -1,43 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle.index.syntactic; - -import org.dllearner.algorithms.isle.index.Document; - -import java.util.Set; - -/** - * Interface for a syntactic index, e.g., a basic string-based inverted index. - * - * @author Lorenz Buehmann - * @author Daniel Fleischhacker - */ -public interface SyntacticIndex { - - /** - * Returns a set of documents based on how the underlying index is processing the given - * search string. - * - * @param searchString query specifying the documents to retrieve - * @return set of documents retrieved based on the given query string - */ - Set<Document> getDocuments(String searchString); - - /** - * Returns the number of documents based on how the underlying index is processing the - * given search string. - * - * @param searchString query specifying the documents to include in the number of documents - * @return number of documents retrieved based on the given query string - */ - int count(String searchString); - - /** - * Returns the total number of documents contained in the index. 
- * - * @return the total number of documents contained in the index - */ - int getSize(); - -} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -3,6 +3,13 @@ */ package org.dllearner.algorithms.isle.index.syntactic; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; @@ -14,15 +21,9 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.algorithms.isle.index.TextDocument; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - /** * Creates a syntactic index from text files stored on disk * @@ -39,7 +40,7 @@ this.inputDirectory = inputDirectory; } - public SyntacticIndex buildIndex() throws Exception{ + public Index buildIndex() throws Exception{ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); @@ -79,7 +80,7 @@ return new LuceneSyntacticIndex(indexDirectory, searchField); } - public SyntacticIndex 
buildIndex(Set<TextDocument> documents) throws Exception{ + public Index buildIndex(Set<TextDocument> documents) throws Exception{ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); @@ -106,7 +107,7 @@ return new LuceneSyntacticIndex(indexDirectory, searchField); } - public static SyntacticIndex loadIndex(File indexDirectory) throws Exception { + public static Index loadIndex(File indexDirectory) throws Exception { return new LuceneSyntacticIndex(new SimpleFSDirectory(indexDirectory), searchField); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -46,6 +46,7 @@ private boolean useShortFormFallback = true; private IRIShortFormProvider sfp = new SimpleIRIShortFormProvider(); + protected boolean determineHeadNoun = false; private OWLAnnotationProperty[] properties; @@ -97,7 +98,7 @@ } //remove content in brackets like (...) 
label = label.replaceAll("\\s?\\((.*?)\\)", ""); - textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label), weight); + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label, determineHeadNoun), weight); } } } @@ -107,7 +108,7 @@ String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromCamelCase(shortForm)); shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromUnderscored(shortForm)).trim(); - textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(shortForm), weight); + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(shortForm, determineHeadNoun), weight); } return textWithWeight; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -34,10 +34,12 @@ public RDFSLabelEntityTextRetriever(OWLOntology ontology) { super(ontology, new OWLDataFactoryImpl().getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI())); + determineHeadNoun = true; } public RDFSLabelEntityTextRetriever(OWLAPIOntology ontology) { super(ontology, new OWLDataFactoryImpl().getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI())); + determineHeadNoun = true; } public static void main(String[] args) throws Exception { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |