From: <lor...@us...> - 2013-12-10 13:16:50
Revision: 4203
          http://sourceforge.net/p/dl-learner/code/4203
Author:   lorenz_b
Date:     2013-12-10 13:16:47 +0000 (Tue, 10 Dec 2013)
Log Message:
-----------
Added SOLR-based syntactic index.

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java	2013-12-10 12:52:52 UTC (rev 4202)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java	2013-12-10 13:16:47 UTC (rev 4203)
@@ -112,14 +112,14 @@
//        OWLClassExpression owlapiDescription = OWLAPIConverter.getOWLAPIDescription(expression);
//        Set<Entity> entities = OWLAPIConverter.getEntities(owlapiDescription.getSignature());
        Set<Entity> entities = expression.getSignature();
-        double sum = 0;
-        for (Entity entity : entities) {
-            double relevance = entityRelevance.containsKey(entity) ? entityRelevance.get(entity) : 0;//System.out.println(entity + ":" + relevance);
-            if(!Double.isInfinite(relevance)){
-                sum += relevance;
-            }
-        }
-        score += nlpBonusFactor * sum;
+//        double sum = 0;
+//        for (Entity entity : entities) {
+//            double relevance = entityRelevance.containsKey(entity) ? entityRelevance.get(entity) : 0;//System.out.println(entity + ":" + relevance);
+//            if(!Double.isInfinite(relevance)){
+//                sum += relevance;
+//            }
+//        }
+//        score += nlpBonusFactor * sum;
        return score;
    }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java	2013-12-10 12:52:52 UTC (rev 4202)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java	2013-12-10 13:16:47 UTC (rev 4203)
@@ -74,15 +74,15 @@
//                System.out.println(tree.headTerminal(headFinder));
                head = tree.headTerminal(headFinder).toString();
-                // Create a reusable pattern object
-                TregexPattern patternMW = TregexPattern.compile("__ >># NP");
-                // Run the pattern on one particular tree
-                TregexMatcher matcher = patternMW.matcher(tree);
-                // Iterate over all of the subtrees that matched
-                while (matcher.findNextMatchingNode()) {
-                    Tree match = matcher.getMatch();
-                    // do what we want to with the subtree
-                }
+//                // Create a reusable pattern object
+//                TregexPattern patternMW = TregexPattern.compile("__ >># NP");
+//                // Run the pattern on one particular tree
+//                TregexMatcher matcher = patternMW.matcher(tree);
+//                // Iterate over all of the subtrees that matched
+//                while (matcher.findNextMatchingNode()) {
+//                    Tree match = matcher.getMatch();
+//                    // do what we want to with the subtree
+//                }
            }
        }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java	2013-12-10 12:52:52 UTC (rev 4202)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java	2013-12-10 13:16:47 UTC (rev 4203)
@@ -21,11 +21,29 @@
     * @return set of documents retrieved based on the given query string
     */
    Set<AnnotatedDocument> getDocuments(Entity entity);
+
+    /**
+     * Returns a set of documents based on how the underlying index is processing the given
+     * search string.
+     *
+     * @param searchString query specifying the documents to retrieve
+     * @return set of documents retrieved based on the given query string
+     */
+    long getNumberOfDocumentsFor(Entity entity);
+
+    /**
+     * Returns a set of documents based on how the underlying index is processing the given
+     * search string.
+     *
+     * @param searchString query specifying the documents to retrieve
+     * @return set of documents retrieved based on the given query string
+     */
+    long getNumberOfDocumentsFor(Entity... entities);

    /**
     * Returns the total number of documents contained in the index.
     *
     * @return the total number of documents contained in the index
     */
-    int getTotalNumberOfDocuments();
+    long getTotalNumberOfDocuments();

}

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java	2013-12-10 12:52:52 UTC (rev 4202)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java	2013-12-10 13:16:47 UTC (rev 4203)
@@ -5,6 +5,7 @@
import java.util.Set;

import org.dllearner.algorithms.isle.index.AnnotatedDocument;
+import org.dllearner.algorithms.isle.index.Index;
import org.dllearner.core.owl.Entity;

/**
@@ -14,7 +15,7 @@
 * @author Lorenz Buehmann
 * @author Daniel Fleischhacker
 */
-public class SemanticIndex extends HashMap<Entity, Set<AnnotatedDocument>>{
+public class SemanticIndex extends HashMap<Entity, Set<AnnotatedDocument>> implements Index{

    private int nrOfDocuments;

@@ -49,11 +50,33 @@
        this.nrOfDocuments = nrOfDocuments;
    }

-    /**
-     * @return the nrOfDocuments
+    /* (non-Javadoc)
+     * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments()
     */
-    public int getTotalNrOfDocuments() {
+    @Override
+    public long getTotalNumberOfDocuments() {
        return nrOfDocuments;
    }

+    /* (non-Javadoc)
+     * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity)
+     */
+    @Override
+    public long getNumberOfDocumentsFor(Entity entity) {
+        return getDocuments(entity).size();
+    }
+
+    /* (non-Javadoc)
+     * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[])
+     */
+    @Override
+    public long getNumberOfDocumentsFor(Entity... entities) {
+
+        Set<AnnotatedDocument> documents = getDocuments(entities[0]);
+        for (int i = 1; i < entities.length; i++) {
+            documents.retainAll(getDocuments(entities[i]));
+        }
+        return 0;
+    }
+
}

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java	2013-12-10 12:52:52 UTC (rev 4202)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java	2013-12-10 13:16:47 UTC (rev 4203)
@@ -80,7 +80,7 @@
            for (Token token : tokens) {
                try {
                    Query query = parser.parse(token.getRawForm());
-                    ScoreDoc[] result = searcher.search(query, getTotalNumberOfDocuments()).scoreDocs;
+                    ScoreDoc[] result = searcher.search(query, indexReader.numDocs()).scoreDocs;
                    for (int i = 0; i < result.length; i++) {
                        Document doc = searcher.doc(result[i].doc);
                        documents.add(new AnnotatedTextDocument(
@@ -102,7 +102,7 @@
     * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments()
     */
    @Override
-    public int getTotalNumberOfDocuments() {
+    public long getTotalNumberOfDocuments() {
        return indexReader.numDocs();
    }

@@ -120,5 +120,21 @@
        return documents;
    }

+    /* (non-Javadoc)
+     * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity)
+     */
+    @Override
+    public long getNumberOfDocumentsFor(Entity entity) {
+        return 0;
+    }
+    /* (non-Javadoc)
+     * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[])
+     */
+    @Override
+    public long getNumberOfDocumentsFor(Entity... entities) {
+        return 0;
+    }
+
+
}

Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java	(rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java	2013-12-10 13:16:47 UTC (rev 4203)
@@ -0,0 +1,122 @@
+/**
+ *
+ */
+package org.dllearner.algorithms.isle.index.syntactic;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+import org.apache.jena.riot.Lang;
+import org.apache.jena.riot.RiotReader;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+
+import com.hp.hpl.jena.graph.Triple;
+
+/**
+ * Creates a Lucene Index for the labels if classes and properties.
+ * @author Lorenz Buehmann
+ *
+ */
+public class NTriplesFileLuceneSyntacticIndexCreator {
+
+    public NTriplesFileLuceneSyntacticIndexCreator(InputStream nTriplesStream, String indexPath, String searchField) throws IOException {
+        //setup the index
+        Directory directory = FSDirectory.open(new File(indexPath));
+
+        //setup the index analyzer
+        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
+        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer);
+        indexWriterConfig.setRAMBufferSizeMB(1024.0);
+        indexWriterConfig.setOpenMode(OpenMode.CREATE);
+        IndexWriter writer = new IndexWriter(directory, indexWriterConfig);
+
+        System.out.println( "Creating index ..." );
+
+        // setup the index fields, here two fields, for URI and text
+        FieldType stringType = new FieldType(StringField.TYPE_STORED);
+        stringType.setStoreTermVectors(false);
+        FieldType textType = new FieldType(TextField.TYPE_STORED);
+        textType.setStoreTermVectors(false);
+
+        Set<Document> documents = new HashSet<Document>();
+
+        Iterator<Triple> iterator = RiotReader.createIteratorTriples(nTriplesStream, Lang.NTRIPLES, null);
+
+        Triple triple;
+        String text;
+        String uri;
+        Document doc;
+        int i = 0;
+        while(iterator.hasNext()){
+            triple = iterator.next();
+
+            uri = triple.getSubject().getURI();
+            text = triple.getObject().getLiteralLexicalForm();
+
+            doc = new Document();
+            doc.add(new Field("uri", uri, stringType));
+            doc.add(new Field(searchField, text, textType));
+
+            writer.addDocument(doc);
+            if(i++ % 10000 == 0){
+//                writer.commit();
+                System.out.println(i);
+            }
+
+        }
+
+        writer.commit();
+        writer.close();
+    }
+
+    public static void main(String[] args) throws Exception {
+        String indexFile = "/home/me/Documents/short_abstracts_en.nt";
+//        indexFile = "/tmp/test.nt";
+        String indexPath = "/home/me/Documents/dbpedia/short_abstracts_index";
+//        indexPath = "/tmp/index";
+        String field = "text";
+        new NTriplesFileLuceneSyntacticIndexCreator(new FileInputStream(indexFile), indexPath, field);
+
+        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
+        IndexSearcher searcher = new IndexSearcher(reader);
+        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
+
+        QueryParser parser = new QueryParser(Version.LUCENE_43, field, analyzer);
+        Query query = parser.parse("film AND direction");
+
+        TopDocs docs = searcher.search(query, 10);
+        ScoreDoc[] scoreDocs = docs.scoreDocs;
+
+        for (int i = 0; i < scoreDocs.length; i++) {
+            Document doc = searcher.doc(scoreDocs[i].doc);
+            System.out.println(doc.get(field));
+
+        }
+    }
+
+
+}

Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java	(rev 0)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java	2013-12-10 13:16:47 UTC (rev 4203)
@@ -0,0 +1,176 @@
+/**
+ *
+ */
+package org.dllearner.algorithms.isle.index.syntactic;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.client.solrj.SolrServer;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.impl.HttpSolrServer;
+import org.apache.solr.client.solrj.response.QueryResponse;
+import org.apache.solr.common.SolrDocument;
+import org.apache.solr.common.SolrDocumentList;
+import org.dllearner.algorithms.isle.TextDocumentGenerator;
+import org.dllearner.algorithms.isle.index.AnnotatedDocument;
+import org.dllearner.algorithms.isle.index.AnnotatedTextDocument;
+import org.dllearner.algorithms.isle.index.Index;
+import org.dllearner.algorithms.isle.index.Token;
+import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever;
+import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
+import org.dllearner.core.owl.Entity;
+import org.semanticweb.owlapi.model.OWLOntology;
+
+import com.google.common.base.Joiner;
+
+/**
+ * @author Lorenz Buehmann
+ *
+ */
+public class SolrSyntacticIndex implements Index{
+
+    private SolrServer solr;
+    private AnnotationEntityTextRetriever textRetriever;
+    private String searchField;
+
+    long totalNumberOfDocuments = -1;
+
+    public SolrSyntacticIndex(OWLOntology ontology, String solrServerURL, String searchField) {
+        this.searchField = searchField;
+        solr = new HttpSolrServer(solrServerURL);
+        textRetriever = new RDFSLabelEntityTextRetriever(ontology);
+    }
+
+    /* (non-Javadoc)
+     * @see org.dllearner.algorithms.isle.index.Index#getDocuments(org.dllearner.core.owl.Entity)
+     */
+    @Override
+    public Set<AnnotatedDocument> getDocuments(Entity entity) {
+        Set<AnnotatedDocument> documents = new HashSet<AnnotatedDocument>();
+
+        Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity);
+
+        for (Entry<List<Token>, Double> entry : relevantText.entrySet()) {
+            List<Token> tokens = entry.getKey();
+            for (Token token : tokens) {
+                SolrQuery query = new SolrQuery(searchField + ":" + token.getRawForm());
+                query.setRows(Integer.MAX_VALUE);//can be very slow
+                try {
+                    QueryResponse response = solr.query(query);
+                    SolrDocumentList list = response.getResults();
+                    System.out.println(list.getNumFound());
+                    for (SolrDocument doc : list) {
+                        String uri = (String) doc.getFieldValue("uri");
+                        String comment = (String) doc.getFieldValue(searchField);
+
+                        documents.add(new AnnotatedTextDocument(
+                                TextDocumentGenerator.getInstance().generateDocument((String) doc.getFieldValue(searchField)),
+                                Collections.EMPTY_SET));
+                    }
+                } catch (SolrServerException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+        return documents;
+    }
+
+    /* (non-Javadoc)
+     * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments()
+     */
+    @Override
+    public long getTotalNumberOfDocuments() {
+        if(totalNumberOfDocuments == -1){
+            SolrQuery q = new SolrQuery("*:*");
+            q.setRows(0); // don't actually request any data
+            try {
+                totalNumberOfDocuments = solr.query(q).getResults().getNumFound();
+            } catch (SolrServerException e) {
+                e.printStackTrace();
+            }
+        }
+        return totalNumberOfDocuments;
+    }
+
+    /* (non-Javadoc)
+     * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity)
+     */
+    @Override
+    public long getNumberOfDocumentsFor(Entity entity) {
+        Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity);
+
+        String queryString = "(";
+        Set<String> terms = new HashSet<>();
+        for (Entry<List<Token>, Double> entry : relevantText.entrySet()) {
+            List<Token> tokens = entry.getKey();
+            String phrase = "";
+            for (Token token : tokens) {
+//                terms.add(token.getRawForm());
+                phrase += token.getRawForm() + " ";
+            }
+            phrase.trim();
+            terms.add(phrase);
+        }
+        queryString += Joiner.on("OR").join(terms);
+        queryString += ")";
+
+        SolrQuery query = new SolrQuery(searchField + ":" + queryString);System.out.println(query);
+        try {
+            QueryResponse response = solr.query(query);
+            SolrDocumentList list = response.getResults();
+            return list.getNumFound();
+        } catch (SolrServerException e) {
+            e.printStackTrace();
+        }
+        return -1;
+    }
+
+    /* (non-Javadoc)
+     * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[])
+     */
+    @Override
+    public long getNumberOfDocumentsFor(Entity... entities) {
+
+        Set<String> queryStringParts = new HashSet<>();
+
+        for (Entity entity : entities) {
+            Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity);
+
+            String queryString = "(";
+            Set<String> terms = new HashSet<>();
+            for (Entry<List<Token>, Double> entry : relevantText.entrySet()) {
+                List<Token> tokens = entry.getKey();
+                String phrase = "";
+                for (Token token : tokens) {
+//                    terms.add(token.getRawForm());
+                    phrase += token.getRawForm() + " ";
+                }
+                phrase.trim();
+                terms.add(phrase);
+            }
+            queryString += Joiner.on("OR").join(terms);
+            queryString += ")";
+            queryStringParts.add(queryString);
+        }
+
+        String queryStringConjuction = "(" + Joiner.on("AND").join(queryStringParts) + ")";
+
+
+        SolrQuery query = new SolrQuery(searchField + ":" + queryStringConjuction);System.out.println(query);
+        try {
+            QueryResponse response = solr.query(query);
+            SolrDocumentList list = response.getResults();
+            return list.getNumFound();
+        } catch (SolrServerException e) {
+            e.printStackTrace();
+        }
+        return -1;
+    }
+
+}

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java	2013-12-10 12:52:52 UTC (rev 4202)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java	2013-12-10 13:16:47 UTC (rev 4203)
@@ -6,7 +6,7 @@
import java.util.HashMap;
import java.util.Map;

-import org.dllearner.algorithms.isle.index.semantic.SemanticIndex;
+import org.dllearner.algorithms.isle.index.Index;
import org.semanticweb.owlapi.model.OWLEntity;

/**
@@ -15,9 +15,9 @@
 */
public abstract class AbstractRelevanceMetric implements RelevanceMetric {

-    protected SemanticIndex index;
+    protected Index index;

-    public AbstractRelevanceMetric(SemanticIndex index) {
+    public AbstractRelevanceMetric(Index index) {
        this.index = index;
    }

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java	2013-12-10 12:52:52 UTC (rev 4202)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java	2013-12-10 13:16:47 UTC (rev 4203)
@@ -6,7 +6,7 @@
import java.util.Set;

import org.dllearner.algorithms.isle.index.AnnotatedDocument;
-import org.dllearner.algorithms.isle.index.semantic.SemanticIndex;
+import org.dllearner.algorithms.isle.index.Index;
import org.dllearner.core.owl.Entity;

import com.google.common.collect.Sets;
@@ -17,21 +17,22 @@
 */
public class PMIRelevanceMetric extends AbstractRelevanceMetric {

-    public PMIRelevanceMetric(SemanticIndex index) {
+    public PMIRelevanceMetric(Index index) {
        super(index);
    }

    @Override
    public double getRelevance(Entity entityA, Entity entityB){
-        Set<AnnotatedDocument> documentsA = index.getDocuments(entityA);
-        Set<AnnotatedDocument> documentsB = index.getDocuments(entityB);
-        Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB);
-        int nrOfDocuments = index.getTotalNrOfDocuments();
+        long nrOfDocumentsA = index.getNumberOfDocumentsFor(entityA);
+        long nrOfDocumentsB = index.getNumberOfDocumentsFor(entityB);
+        long nrOfDocumentsAB = index.getNumberOfDocumentsFor(entityA, entityB);

-        double pA = nrOfDocuments == 0 ? 0 : ((double) documentsA.size() / (double) nrOfDocuments);
-        double pB = nrOfDocuments == 0 ? 0 : ((double) documentsB.size() / (double) nrOfDocuments);
-        double pAB = nrOfDocuments == 0 ? 0 : ((double) documentsAB.size() / (double) nrOfDocuments);
+        long nrOfDocuments = index.getTotalNumberOfDocuments();
+        double pA = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsA / (double) nrOfDocuments);
+        double pB = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsB / (double) nrOfDocuments);
+        double pAB = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsAB / (double) nrOfDocuments);
+        double pmi = Math.log(pAB / pA * pB);

        return pmi;
@@ -42,7 +43,7 @@
        Set<AnnotatedDocument> documentsA = index.getDocuments(entityA);
        Set<AnnotatedDocument> documentsB = index.getDocuments(entityB);
        Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB);
-        int nrOfDocuments = index.getTotalNrOfDocuments();
+        long nrOfDocuments = index.getTotalNumberOfDocuments();
//        System.out.println("A:" + documentsA.size());
//        System.out.println("B:" + documentsB.size());
//        System.out.println("AB:" + documentsAB.size());

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
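
A note on the varargs getNumberOfDocumentsFor(Entity...) added to SemanticIndex in this revision: it builds the intersection of the per-entity document sets but then returns the constant 0, so a PMI computed over a SemanticIndex would always see a joint document count of zero. A minimal sketch of the presumably intended counting (an assumption, not part of the commit), which also copies the first set so retainAll() does not mutate the set stored in the index:

    @Override
    public long getNumberOfDocumentsFor(Entity... entities) {
        // copy the first result set so retainAll() does not modify the set stored in the index
        Set<AnnotatedDocument> documents = new HashSet<AnnotatedDocument>(getDocuments(entities[0]));
        for (int i = 1; i < entities.length; i++) {
            documents.retainAll(getDocuments(entities[i]));
        }
        // size of the intersection instead of the constant 0
        return documents.size();
    }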
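
A note on the query strings built in SolrSyntacticIndex.getNumberOfDocumentsFor: Joiner.on("OR") and Joiner.on("AND") insert the operator without surrounding spaces, so two labels end up glued together (e.g. "british film ORactor"), and phrase.trim() discards its result because Java strings are immutable. A minimal sketch of one way to build the disjunction for a single entity; buildDisjunction is a hypothetical helper, and quoting each label as a Solr phrase query is an assumption, not what the committed code does:

    // Hypothetical helper: builds "(\"label one\" OR \"label two\")" from an entity's relevant text.
    private String buildDisjunction(Map<List<Token>, Double> relevantText) {
        Set<String> phrases = new HashSet<String>();
        for (List<Token> tokens : relevantText.keySet()) {
            StringBuilder phrase = new StringBuilder();
            for (Token token : tokens) {
                phrase.append(token.getRawForm()).append(" ");
            }
            // trim() returns a new string; the return value has to be used, not discarded
            phrases.add("\"" + phrase.toString().trim() + "\"");
        }
        return "(" + Joiner.on(" OR ").join(phrases) + ")";
    }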
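
A note on the new PMI computation in PMIRelevanceMetric: Math.log(pAB / pA * pB) is evaluated left to right as (pAB / pA) * pB, whereas pointwise mutual information is log(p(A,B) / (p(A) * p(B))). A minimal sketch with explicit parentheses and a guard against log(0); mapping the undefined cases to 0 is an assumption, not part of the commit:

    @Override
    public double getRelevance(Entity entityA, Entity entityB) {
        long nrOfDocumentsA = index.getNumberOfDocumentsFor(entityA);
        long nrOfDocumentsB = index.getNumberOfDocumentsFor(entityB);
        long nrOfDocumentsAB = index.getNumberOfDocumentsFor(entityA, entityB);
        long nrOfDocuments = index.getTotalNumberOfDocuments();

        // PMI is undefined if any count is zero; fall back to 0 (assumption)
        if (nrOfDocuments == 0 || nrOfDocumentsA == 0 || nrOfDocumentsB == 0 || nrOfDocumentsAB == 0) {
            return 0;
        }
        double pA = (double) nrOfDocumentsA / nrOfDocuments;
        double pB = (double) nrOfDocumentsB / nrOfDocuments;
        double pAB = (double) nrOfDocumentsAB / nrOfDocuments;
        // PMI(A,B) = log( p(A,B) / (p(A) * p(B)) )
        return Math.log(pAB / (pA * pB));
    }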
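
For context, a rough usage sketch of the pieces added in this revision: build the Lucene index once from an N-Triples file with literal objects, or point the Solr-backed index at an existing core and feed it into the PMI metric. The file paths, Solr URL, core name, field name and the NamedClass-based entity construction are placeholders/assumptions, not taken from the commit:

    public static void main(String[] args) throws Exception {
        // one-off: index an N-Triples file into a local Lucene index
        new NTriplesFileLuceneSyntacticIndexCreator(
                new FileInputStream("/path/to/short_abstracts_en.nt"), "/path/to/lucene-index", "text");

        // query time: Solr-backed syntactic index + PMI relevance metric
        OWLOntology ontology = OWLManager.createOWLOntologyManager()
                .loadOntologyFromOntologyDocument(new File("/path/to/ontology.owl"));
        Index index = new SolrSyntacticIndex(ontology, "http://localhost:8983/solr/dbpedia", "text");
        RelevanceMetric pmi = new PMIRelevanceMetric(index);
        Entity film = new NamedClass("http://dbpedia.org/ontology/Film");
        Entity actor = new NamedClass("http://dbpedia.org/ontology/Actor");
        System.out.println(pmi.getRelevance(film, actor));
    }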