From: <lor...@us...> - 2013-07-17 11:44:44
Revision: 4021
          http://sourceforge.net/p/dl-learner/code/4021
Author:   lorenz_b
Date:     2013-07-17 11:44:41 +0000 (Wed, 17 Jul 2013)

Log Message:
-----------
Refactored ISLE components.

Modified Paths:
--------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityExtraction.java

Added Paths:
-----------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndexCreator.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceMetric.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceUtils.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSCommentEntityTextRetriever.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java

Removed Paths:
-------------
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/AnnotationEntityTextRetriever.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityTextRetriever.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneBasedRelevance.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneDocument.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneIndexer.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneSearcher.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/OWLOntologyLuceneIndex.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/PMILuceneBasedRelevance.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RDFSCommentEntityTextRetriever.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RDFSLabelEntityTextRetriever.java
    trunk/components-core/src/main/java/org/dllearner/algorithms/isle/Relevance.java

Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/AnnotationEntityTextRetriever.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/AnnotationEntityTextRetriever.java	2013-07-16 05:25:41 UTC (rev 4020)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/AnnotationEntityTextRetriever.java	2013-07-17 11:44:41 UTC (rev 4021)
@@ -1,93 +0,0 @@
-/** - * - */ -package
org.dllearner.algorithms.isle; - -import java.util.HashMap; -import java.util.Map; -import java.util.Set; - -import org.dllearner.core.owl.Entity; -import org.dllearner.kb.OWLAPIOntology; -import org.dllearner.utilities.owl.OWLAPIConverter; -import org.semanticweb.owlapi.model.IRI; -import org.semanticweb.owlapi.model.OWLAnnotation; -import org.semanticweb.owlapi.model.OWLAnnotationProperty; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLLiteral; -import org.semanticweb.owlapi.model.OWLOntology; -import org.semanticweb.owlapi.model.OWLOntologyManager; -import org.semanticweb.owlapi.util.IRIShortFormProvider; -import org.semanticweb.owlapi.util.SimpleIRIShortFormProvider; - - -/** - * @author Lorenz Buehmann - * - */ -public class AnnotationEntityTextRetriever implements EntityTextRetriever{ - - private OWLOntology ontology; - private OWLOntologyManager manager; - - private String language = "en"; - private double weight = 1d; - - private boolean useShortFormFallback = true; - private IRIShortFormProvider sfp = new SimpleIRIShortFormProvider(); - - private OWLAnnotationProperty[] properties; - - public AnnotationEntityTextRetriever(OWLOntology ontology, OWLAnnotationProperty... properties) { - this.ontology = ontology; - this.properties = properties; - } - - public AnnotationEntityTextRetriever(OWLAPIOntology ontology, OWLAnnotationProperty... properties) { - this.ontology = ontology.createOWLOntology(manager); - } - - /** - * @param language the language to set - */ - public void setLanguage(String language) { - this.language = language; - } - - /** - * Whether to use the short form of the IRI as fallback, if no label is given. - * @param useShortFormFallback the useShortFormFallback to set - */ - public void setUseShortFormFallback(boolean useShortFormFallback) { - this.useShortFormFallback = useShortFormFallback; - } - - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.EntityTextRetriever#getRelevantText(org.dllearner.core.owl.Entity) - */ - @Override - public Map<String, Double> getRelevantText(Entity entity) { - Map<String, Double> textWithWeight = new HashMap<String, Double>(); - - OWLEntity e = OWLAPIConverter.getOWLAPIEntity(entity); - - for (OWLAnnotationProperty property : properties) { - Set<OWLAnnotation> annotations = e.getAnnotations(ontology, property); - for (OWLAnnotation annotation : annotations) { - if (annotation.getValue() instanceof OWLLiteral) { - OWLLiteral val = (OWLLiteral) annotation.getValue(); - if (val.hasLang(language)) { - String label = val.getLiteral(); - textWithWeight.put(label, weight); - } - } - } - } - - if(textWithWeight.isEmpty() && useShortFormFallback){ - textWithWeight.put(sfp.getShortForm(IRI.create(entity.getURI())), weight); - } - - return textWithWeight; - } -} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityExtraction.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityExtraction.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityExtraction.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -4,6 +4,7 @@ package org.dllearner.algorithms.isle; import java.util.Map; +import java.util.Set; import org.dllearner.core.owl.Entity; @@ -17,7 +18,7 @@ * Extracts all entities contained in the working text with some confidence value. 
* @return */ - Map<Entity, Double> extractEntities(); + Map<Entity, Set<String>> extractEntities(); /** * Extracts all entities of the given <code>type</code> contained in the working text with some confidence value. Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityTextRetriever.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityTextRetriever.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -1,48 +0,0 @@ -/** - * Copyright (C) 2007-2011, Jens Lehmann - * - * This file is part of DL-Learner. - * - * DL-Learner is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * DL-Learner is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -package org.dllearner.algorithms.isle; - -import java.util.Map; - -import org.dllearner.core.owl.Entity; - -/** - * Interface for methods, which retrieve relevant texts given an entity - * in an ontology. An entity text retriever can do simple operations such - * as converting the URI into text or retrieving an rdfs:label, but could - * also search web pages for textual explanations of an entity. - * - * @author Jens Lehmann - * - */ -public interface EntityTextRetriever { - - /** - * The method retrieves a string or a set of strings, which is weighted by - * importance with respect to the entity. For instance, an rdfs:label of - * an entity can be given more weight than an rdfs:comment, which in turn - * can be more important than a description retrieved from a web page. - * - * @param entity The entity to handle. - * @return A weighted set of strings. For a value x, we need to have 0 <= x <= 1. - */ - public Map<String, Double> getRelevantText(Entity entity); - -} Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneBasedRelevance.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneBasedRelevance.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneBasedRelevance.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -1,145 +0,0 @@ -/** - * Copyright (C) 2007-2011, Jens Lehmann - * - * This file is part of DL-Learner. - * - * DL-Learner is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * DL-Learner is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. 
If not, see <http://www.gnu.org/licenses/>. - */ - - -package org.dllearner.algorithms.isle; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - -import org.dllearner.core.owl.Entity; -import org.dllearner.utilities.owl.OWLAPIConverter; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLOntology; - - -public abstract class LuceneBasedRelevance implements Relevance{ - - private EntityTextRetriever textRetriever; - private LuceneSearcher searcher; - private OWLOntology ontology; - private Set<OWLEntity> entities; - -// public void printScores() throws Exception { -// for( OWLClass c: m_classes ) -// { -// Map<OWLEntity,Double> hmEntity2Score = getEntityRelevance(c); -// // normalization per class? -// hmEntity2Score = normalize( hmEntity2Score ); -// for( OWLEntity e : hmEntity2Score.keySet() ) -// { -// double dScore = hmEntity2Score.get(e); -// System.out.println( "P( "+ getLabel(c) +", "+ getLabel(e) +" ) = "+ dScore ); -// } -// } -// m_searcher.close(); -// } - - public LuceneBasedRelevance(OWLOntology ontology, LuceneSearcher searcher, EntityTextRetriever textRetriever) { - this.searcher = searcher; - this.ontology = ontology; - this.textRetriever = textRetriever; - - entities = new HashSet<OWLEntity>(); - entities.addAll(ontology.getClassesInSignature()); - entities.addAll(ontology.getObjectPropertiesInSignature()); - entities.addAll(ontology.getDataPropertiesInSignature()); - } - - public Map<OWLEntity,Double> normalizeMinMax( Map<OWLEntity,Double> hmEntity2Score ){ - Map<OWLEntity,Double> hmEntity2Norm = new HashMap<OWLEntity,Double>(); - double dMin = Double.MAX_VALUE; - Double dMax = Double.MIN_VALUE; - for( OWLEntity e : hmEntity2Score.keySet() ) - { - double dValue = hmEntity2Score.get(e); - if( dValue < dMin ){ - dMin = dValue; - } - else if( dValue > dMax ){ - dMax = dValue; - } - } - // System.out.println( "min="+ dMin +" max="+ dMax ); - for( OWLEntity e : hmEntity2Score.keySet() ) - { - double dValue = hmEntity2Score.get(e); - double dNorm = 0; - if( dMin == dMax ){ - dNorm = dValue; - } - else { - dNorm = ( dValue - dMin ) / ( dMax - dMin ); - } - hmEntity2Norm.put( e, dNorm ); - } - return hmEntity2Norm; - } - - @Override - public Map<Entity,Double> getEntityRelevance(Entity entity) throws Exception { - // computes relevance of entity for this class - // conditional probability: P(C,E)=f(C,E)/f(E) - // PMI(C,E)=log( P(C,E) / P(C) ) - Map<Entity, Double> hmEntity2Score = new HashMap<Entity, Double>(); - Map<String, Double> relevantText = textRetriever.getRelevantText(entity); - - for (Entry<String, Double> entry : relevantText.entrySet()) { - String text = entry.getKey(); - Double value = entry.getValue(); - - String sClass = text; - int nrOfDocumentsA = searcher.count(sClass); - int nrOfDocuments = searcher.indexSize(); - - for (OWLEntity otherEntity : entities) { - - Map<String, Double> otherRelevantText = textRetriever.getRelevantText(OWLAPIConverter - .getEntity(otherEntity)); - - for (Entry<String, Double> entry2 : otherRelevantText.entrySet()) { - String otherText = entry2.getKey(); - Double otherValue = entry2.getValue(); - - String sEntity = otherText; - int nrOfDocumentsB = searcher.count(sEntity); - int nrOfDocumentsAB = searcher.count(sClass + " AND " + sEntity); - // double dPEntity = (double)iEntity / (double)iAll; - - double score = computeScore(nrOfDocuments, nrOfDocumentsA, nrOfDocumentsB, nrOfDocumentsAB); - if (!Double.isNaN(score)){// && 
!Double.isInfinite(score)) { - hmEntity2Score.put(OWLAPIConverter.getEntity(otherEntity), score); - } - } - } - } - - return hmEntity2Score; - } - - /** - * Computes the score which is returned in {@link org.dllearner.algorithms.isle.LuceneBasedRelevance#getEntityRelevance} - * @return - */ - public abstract double computeScore(int nrOfDocuments, int nrOfDocumentsA, int nrOfDocumentsB, int nrOfDocumentsAB); - -} \ No newline at end of file Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneDocument.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneDocument.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -1,43 +0,0 @@ -/** - * Copyright (C) 2007-2011, Jens Lehmann - * - * This file is part of DL-Learner. - * - * DL-Learner is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * DL-Learner is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - - -package org.dllearner.algorithms.isle; - -import java.io.File; -import java.io.FileReader; - -import org.apache.lucene.document.DateTools; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; - - -public class LuceneDocument { - - public static Document Document( File f ) throws java.io.FileNotFoundException { - Document doc = new Document(); - doc.add( new Field( "path", f.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED ) ); - doc.add( new Field( "modified", - DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE), - Field.Store.YES, Field.Index.NOT_ANALYZED)); - doc.add( new Field( "contents", new FileReader(f) ) ); - return doc; - } -} - Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneIndexer.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneIndexer.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneIndexer.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -1,100 +0,0 @@ -/** - * Copyright (C) 2007-2011, Jens Lehmann - * - * This file is part of DL-Learner. - * - * DL-Learner is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * DL-Learner is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - - -package org.dllearner.algorithms.isle; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.Date; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.Version; - - -public class LuceneIndexer { - - static final File INDEX = new File( "index" ); - - public static void main( String[] args ) { - if( INDEX.exists() ) - { - System.out.println("<delete index!>"); - System.exit(1); - } -// final File docDir = new File( args[0] ); -// LuceneIndexer indexer = new LuceneIndexer( docDir ); - } - - @SuppressWarnings("deprecation") - public LuceneIndexer( File docDir ){ - System.out.println( "LuceneIndex: "+ docDir ); - Date start = new Date(); - try { - - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); - IndexWriter writer = new IndexWriter( FSDirectory.open( INDEX ), indexWriterConfig); - System.out.println( "Creating index ..." ); - index( writer, docDir ); - System.out.println( "Optimizing index ..." ); - writer.close(); - Date end = new Date(); - System.out.println( end.getTime() - start.getTime() + " total milliseconds" ); - } - catch (IOException e) { - e.printStackTrace(); - } - } - - private void index( IndexWriter writer, File file ) throws IOException { - // System.out.println( "LuceneIndexer.index: "+ file ); - if( file.canRead() ) - { - if( file.isDirectory() ) - { - String[] files = file.list(); - if( files != null ) - { - for( int i = 0; i < files.length; i++ ) { - index( writer, new File( file, files[i] ) ); - } - } - } - else { - // System.out.println( "Indexer.index: adding " + file ); - try { - writer.addDocument( LuceneDocument.Document( file ) ); - } - catch (FileNotFoundException fnfe) { - fnfe.printStackTrace(); - } - } - } - else { - System.out.println( "<cannot read file!>" ); - } - } - -} Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneSearcher.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneSearcher.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/LuceneSearcher.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -1,176 +0,0 @@ -/** - * Copyright (C) 2007-2011, Jens Lehmann - * - * This file is part of DL-Learner. - * - * DL-Learner is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * DL-Learner is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. 
- */ - - -package org.dllearner.algorithms.isle; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.AtomicReaderContext; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.queryparser.classic.QueryParser; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.Version; - -public class LuceneSearcher { - - private String INDEX = "/home/me/DBpedia-Lucene-Index"; - private String FIELD = "short-abstract"; - - private IndexReader m_reader = null; - private IndexSearcher m_searcher = null; - private Analyzer m_analyzer = null; - private QueryParser m_parser = null; - - private Map<Document,Float> m_results = null; - - - public static void main( String[] args ) throws Exception { - String sQuery = args[0]; - LuceneSearcher searcher = new LuceneSearcher(); - List<Document> docs = searcher.search( sQuery ); - System.out.println( "\nquery='"+ sQuery +"' all="+ searcher.indexSize() +" hits="+ docs.size() ); -// for( Document doc : docs ) -// { -//// String sDoc = doc.toString(); -// float score = searcher.getScore( doc ); -// System.out.println( "score="+ score +" doc="+ doc ); -// } - } - - @SuppressWarnings("deprecation") - public LuceneSearcher() throws Exception { - m_reader = DirectoryReader.open( FSDirectory.open( new File( INDEX ) )); - m_searcher = new IndexSearcher( m_reader ); - m_analyzer = new StandardAnalyzer( Version.LUCENE_43); - m_parser = new QueryParser( Version.LUCENE_43, FIELD, m_analyzer ); - } - - public LuceneSearcher(IndexReader indexReader) throws Exception { - m_reader = indexReader; - m_searcher = new IndexSearcher( m_reader ); - m_analyzer = new StandardAnalyzer( Version.LUCENE_43); - m_parser = new QueryParser( Version.LUCENE_43, FIELD, m_analyzer ); - } - - public LuceneSearcher(Directory directory, String seachField) throws Exception { - this.FIELD = seachField; - m_reader = DirectoryReader.open(directory); - m_searcher = new IndexSearcher( m_reader ); - m_analyzer = new StandardAnalyzer( Version.LUCENE_43); - m_parser = new QueryParser( Version.LUCENE_43, FIELD, m_analyzer ); - } - - public LuceneSearcher(String indexDirectory) throws Exception { - m_reader = DirectoryReader.open(FSDirectory.open(new File(indexDirectory))); - m_searcher = new IndexSearcher( m_reader ); - m_analyzer = new StandardAnalyzer( Version.LUCENE_43); - m_parser = new QueryParser( Version.LUCENE_43, FIELD, m_analyzer ); - } - - public void close() throws Exception { - m_reader.close(); - } - - public int indexSize(){ - return m_reader.numDocs(); - } - - public List<Document> search( String sQuery ) throws Exception { - m_results = new HashMap<Document,Float>(); - Query query = m_parser.parse( sQuery ); - search( query ); - // m_reader.close(); - return getDocuments(); - } - - public int count( String sQuery ) throws Exception { - return search( sQuery ).size(); - } - - public List<Document> getDocuments(){ - List<Document> docs = new 
ArrayList<Document>(); - for( Document doc: m_results.keySet() ){ - docs.add( doc ); - } - Collections.sort( docs, new Comparator<Document>(){ - public int compare( Document d1, Document d2 ){ - float s1 = getScore( d1 ); - float s2 = getScore( d2 ); - if( s1 > s2 ) return -1; - else if( s1 < s2 ) return 1; - return 0; - } - @Override - public boolean equals( Object obj ){ - return false; - } - } ); - return docs; - } - - public float getScore( Document doc ){ - return m_results.get( doc ); - } - - private void search( Query query ) throws IOException { - @SuppressWarnings("unused") - Collector collector = new Collector() - { - private Scorer scorer; - private int docBase; - private Map<Document,Float> results = new HashMap<Document,Float>(); - - @Override - public void collect(int doc) throws IOException { - // System.out.println("doc=" + doc + docBase + " score=" + scorer.score()); - m_results.put( m_searcher.doc( doc ), scorer.score() ); - } - @Override - public boolean acceptsDocsOutOfOrder() { - return true; - } - @Override - public void setScorer(Scorer scorer) throws IOException { - this.scorer = scorer; - } - @Override - public void setNextReader(AtomicReaderContext context) throws IOException { - this.docBase = context.docBase; - } - }; - m_searcher.search( query, collector ); - } -} Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/OWLOntologyLuceneIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/OWLOntologyLuceneIndex.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/OWLOntologyLuceneIndex.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -1,142 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle; - -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.RAMDirectory; -import org.apache.lucene.util.Version; -import org.semanticweb.owlapi.model.IRI; -import org.semanticweb.owlapi.model.OWLAnnotation; -import org.semanticweb.owlapi.model.OWLAnnotationProperty; -import org.semanticweb.owlapi.model.OWLDataFactory; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLLiteral; -import org.semanticweb.owlapi.model.OWLOntology; -import org.semanticweb.owlapi.vocab.OWLRDFVocabulary; - -import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; - -/** - * Creates a Lucene Index for the labels if classes and properties. 
- * @author Lorenz Buehmann - * - */ -public class OWLOntologyLuceneIndex { - - private Directory directory = new RAMDirectory(); - private OWLOntology ontology; - private Set<OWLEntity> schemaEntities; - - private OWLDataFactory df = new OWLDataFactoryImpl(); - private OWLAnnotationProperty annotationProperty = df.getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI()); - private String language = "en"; - private String searchField; - - public OWLOntologyLuceneIndex(OWLOntology ontology, String searchField) throws IOException { - this.ontology = ontology; - this.searchField = searchField; - - schemaEntities = new HashSet<OWLEntity>(); - schemaEntities.addAll(ontology.getClassesInSignature()); - schemaEntities.addAll(ontology.getObjectPropertiesInSignature()); - schemaEntities.addAll(ontology.getDataPropertiesInSignature()); - - buildIndex(); - } - - public OWLOntologyLuceneIndex(OWLOntology ontology, OWLAnnotationProperty annotationProperty) throws IOException { - this.ontology = ontology; - this.annotationProperty = annotationProperty; - - schemaEntities = new HashSet<OWLEntity>(); - schemaEntities.addAll(ontology.getClassesInSignature()); - schemaEntities.addAll(ontology.getObjectPropertiesInSignature()); - schemaEntities.addAll(ontology.getDataPropertiesInSignature()); - - buildIndex(); - } - - /** - * @return the ontology - */ - public OWLOntology getOntology() { - return ontology; - } - - /** - * @return the directory - */ - public Directory getDirectory() { - return directory; - } - - /** - * @param annotationProperty the annotationProperty to set - */ - public void setAnnotationProperty(OWLAnnotationProperty annotationProperty) { - this.annotationProperty = annotationProperty; - } - - /** - * @param annotationProperty the annotationProperty to set - */ - public void setAnnotationProperty(String annotationPropertyIRI) { - this.annotationProperty = df.getOWLAnnotationProperty(IRI.create(annotationPropertyIRI)); - } - - public void buildIndex() throws IOException{ - Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); - IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); - IndexWriter writer = new IndexWriter(directory, indexWriterConfig); - System.out.println( "Creating index ..." 
); - - Set<Document> luceneDocuments = new HashSet<Document>(); - FieldType stringType = new FieldType(StringField.TYPE_STORED); - stringType.setStoreTermVectors(false); - FieldType textType = new FieldType(TextField.TYPE_STORED); - textType.setStoreTermVectors(false); - - for (OWLEntity entity : schemaEntities) { - String label = null; - Set<OWLAnnotation> annotations = entity.getAnnotations(ontology, annotationProperty); - for (OWLAnnotation annotation : annotations) { - if (annotation.getValue() instanceof OWLLiteral) { - OWLLiteral val = (OWLLiteral) annotation.getValue(); - if (val.hasLang(language)) { - label = val.getLiteral(); - } - } - } - - if(label != null){ - Document luceneDocument = new Document(); - luceneDocument.add(new Field("uri", entity.toStringID(), stringType)); - luceneDocument.add(new Field(searchField, label, textType)); - luceneDocuments.add(luceneDocument); - } - - } - writer.addDocuments(luceneDocuments); - - System.out.println("Done."); - writer.close(); - } - - - -} Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/PMILuceneBasedRelevance.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/PMILuceneBasedRelevance.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/PMILuceneBasedRelevance.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -1,48 +0,0 @@ -/** - * Copyright (C) 2007-2011, Jens Lehmann - * - * This file is part of DL-Learner. - * - * DL-Learner is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * DL-Learner is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - - -package org.dllearner.algorithms.isle; - -import org.semanticweb.owlapi.model.OWLOntology; - - -public class PMILuceneBasedRelevance extends LuceneBasedRelevance{ - - /** - * @param ontology - * @param searcher - * @param textRetriever - */ - public PMILuceneBasedRelevance(OWLOntology ontology, LuceneSearcher searcher, EntityTextRetriever textRetriever) { - super(ontology, searcher, textRetriever); - - } - - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.LuceneBasedRelevance#computeScore(int, int, int, int) - */ - @Override - public double computeScore(int nrOfDocuments, int nrOfDocumentsA, int nrOfDocumentsB, int nrOfDocumentsAB) { - double dPClass = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsA / (double) nrOfDocuments); - double dPClassEntity = nrOfDocumentsB == 0 ? 
0 : (double) nrOfDocumentsAB / (double) nrOfDocumentsB; - double pmi = Math.log(dPClassEntity / dPClass); - return pmi; - } -} \ No newline at end of file Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RDFSCommentEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RDFSCommentEntityTextRetriever.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RDFSCommentEntityTextRetriever.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -1,26 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle; - -import org.dllearner.kb.OWLAPIOntology; -import org.semanticweb.owlapi.model.OWLOntology; -import org.semanticweb.owlapi.vocab.OWLRDFVocabulary; - -import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; - - -/** - * @author Lorenz Buehmann - * - */ -public class RDFSCommentEntityTextRetriever extends AnnotationEntityTextRetriever{ - - public RDFSCommentEntityTextRetriever(OWLOntology ontology) { - super(ontology, new OWLDataFactoryImpl().getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_COMMENT.getIRI())); - } - - public RDFSCommentEntityTextRetriever(OWLAPIOntology ontology) { - super(ontology, new OWLDataFactoryImpl().getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_COMMENT.getIRI())); - } -} Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RDFSLabelEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RDFSLabelEntityTextRetriever.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RDFSLabelEntityTextRetriever.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -1,26 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle; - -import org.dllearner.kb.OWLAPIOntology; -import org.semanticweb.owlapi.model.OWLOntology; -import org.semanticweb.owlapi.vocab.OWLRDFVocabulary; - -import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; - - -/** - * @author Lorenz Buehmann - * - */ -public class RDFSLabelEntityTextRetriever extends AnnotationEntityTextRetriever{ - - public RDFSLabelEntityTextRetriever(OWLOntology ontology) { - super(ontology, new OWLDataFactoryImpl().getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI())); - } - - public RDFSLabelEntityTextRetriever(OWLAPIOntology ontology) { - super(ontology, new OWLDataFactoryImpl().getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI())); - } -} Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/Relevance.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/Relevance.java 2013-07-16 05:25:41 UTC (rev 4020) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/Relevance.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -1,31 +0,0 @@ -/** - * Copyright (C) 2007-2011, Jens Lehmann - * - * This file is part of DL-Learner. - * - * DL-Learner is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. 
- * - * DL-Learner is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - - -package org.dllearner.algorithms.isle; - -import java.util.Map; - -import org.dllearner.core.owl.Entity; - - -public interface Relevance { - - public Map<Entity,Double> getEntityRelevance(Entity entity) throws Exception; -} \ No newline at end of file Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,99 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.io.File; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TotalHitCountCollector; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.Version; + +/** + * @author Lorenz Buehmann + * + */ +public class LuceneSyntacticIndex implements SyntacticIndex { + + private IndexSearcher searcher; + private QueryParser parser; + private IndexReader indexReader; + private String searchField; + + public LuceneSyntacticIndex(IndexReader indexReader, String searchField) throws Exception { + this.indexReader = indexReader; + this.searchField = searchField; + searcher = new IndexSearcher(indexReader); + StandardAnalyzer analyzer = new StandardAnalyzer( Version.LUCENE_43); + parser = new QueryParser( Version.LUCENE_43, searchField, analyzer ); + } + + public LuceneSyntacticIndex(Directory directory, String seachField) throws Exception { + this(DirectoryReader.open(directory), seachField); + } + + public LuceneSyntacticIndex(String indexDirectory, String seachField) throws Exception { + this(DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), seachField); + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.SyntacticIndex#getDocuments(java.lang.String) + */ + @Override + public Set<String> getDocuments(String searchString) { + Set<String> documents = new HashSet<String>(); + try { + Query query = parser.parse(searchString); + ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs; + for (int i = 0; i < result.length; i++) { + Document doc = searcher.doc(result[i].doc); + documents.add(doc.get(searchField)); + } + } catch (ParseException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return null; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.SyntacticIndex#getSize() + */ + @Override + public int 
getSize() { + return indexReader.numDocs(); + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.SyntacticIndex#count(java.lang.String) + */ + @Override + public int count(String searchString) { + try { + Query query = parser.parse(searchString); + TotalHitCountCollector results = new TotalHitCountCollector(); + searcher.search(query, results); + return results.getTotalHits(); + } catch (ParseException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + return -1; + } + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,101 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.Version; +import org.semanticweb.owlapi.model.OWLAnnotation; +import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLDataFactory; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLLiteral; +import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.vocab.OWLRDFVocabulary; + +import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; + +/** + * Creates a Lucene Index for the labels if classes and properties. + * @author Lorenz Buehmann + * + */ +public class OWLOntologyLuceneSyntacticIndexCreator { + + private Directory directory = new RAMDirectory(); + private OWLOntology ontology; + private Set<OWLEntity> schemaEntities; + + private OWLDataFactory df = new OWLDataFactoryImpl(); + private OWLAnnotationProperty annotationProperty = df.getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI()); + private String language = "en"; + private String searchField; + + public OWLOntologyLuceneSyntacticIndexCreator(OWLOntology ontology, OWLAnnotationProperty annotationProperty, String searchField) throws IOException { + this.ontology = ontology; + this.annotationProperty = annotationProperty; + this.searchField = searchField; + + schemaEntities = new HashSet<OWLEntity>(); + schemaEntities.addAll(ontology.getClassesInSignature()); + schemaEntities.addAll(ontology.getObjectPropertiesInSignature()); + schemaEntities.addAll(ontology.getDataPropertiesInSignature()); + } + + public SyntacticIndex buildIndex() throws Exception{ + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); + IndexWriter writer = new IndexWriter(directory, indexWriterConfig); + System.out.println( "Creating index ..." 
); + + Set<Document> luceneDocuments = new HashSet<Document>(); + FieldType stringType = new FieldType(StringField.TYPE_STORED); + stringType.setStoreTermVectors(false); + FieldType textType = new FieldType(TextField.TYPE_STORED); + textType.setStoreTermVectors(false); + + for (OWLEntity entity : schemaEntities) { + String label = null; + Set<OWLAnnotation> annotations = entity.getAnnotations(ontology, annotationProperty); + for (OWLAnnotation annotation : annotations) { + if (annotation.getValue() instanceof OWLLiteral) { + OWLLiteral val = (OWLLiteral) annotation.getValue(); + if (val.hasLang(language)) { + label = val.getLiteral(); + } + } + } + + if(label != null){ + Document luceneDocument = new Document(); + luceneDocument.add(new Field("uri", entity.toStringID(), stringType)); + luceneDocument.add(new Field(searchField, label, textType)); + luceneDocuments.add(luceneDocument); + } + + } + writer.addDocuments(luceneDocuments); + + System.out.println("Done."); + writer.close(); + + return new LuceneSyntacticIndex(directory, searchField); + } + + + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,35 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.util.Set; + +import org.dllearner.core.owl.Entity; + +/** + * This class + * @author Lorenz Buehmann + * + */ +public interface SemanticIndex { + + /** + * This method returns a set of documents for the given entity. + * @param entity + * @return + */ + Set<String> getDocuments(Entity entity); + /** + * This method returns the number of documents for the given entity. + * @param entity + * @return + */ + int count(Entity entity); + /** + * This methods returns the total number of documents contained in the index. + * @return the total number of documents contained in the index + */ + int getSize(); + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndexCreator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndexCreator.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,22 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +/** + * This gets a syntactic index and returns a semantic index by applying WSD etc. 
+ * @author Lorenz Buehmann + * + */ +public class SemanticIndexCreator { + + private SyntacticIndex syntacticIndex; + + public SemanticIndexCreator(SyntacticIndex syntacticIndex) { + this.syntacticIndex = syntacticIndex; + } + + public SemanticIndex createSemanticIndex(){ + return null; + } +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,43 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.util.Set; + +import org.dllearner.core.owl.Entity; + +/** + * @author Lorenz Buehmann + * + */ +public class SimpleSemanticIndex implements SemanticIndex{ + + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.SemanticIndex#getDocuments(org.dllearner.core.owl.Entity) + */ + @Override + public Set<String> getDocuments(Entity entity) { + return null; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.SemanticIndex#count(java.lang.String) + */ + @Override + public int count(Entity entity) { + return 0; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.SemanticIndex#getSize() + */ + @Override + public int getSize() { + return 0; + } + + + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,32 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.util.Set; + +/** + * @author Lorenz Buehmann + * + */ +public interface SyntacticIndex { + + /** + * This method returns a set of documents based on how the underlying index is processing the given search string. + * @param searchString + * @return + */ + Set<String> getDocuments(String searchString); + /** + * This method returns the number of documents based on how the underlying index is processing the given search string. + * @param searchString + * @return + */ + int count(String searchString); + /** + * This methods returns the total number of documents contained in the index. 
+ * @return the total number of documents contained in the index + */ + int getSize(); + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,54 @@ +/** + * + */ +package org.dllearner.algorithms.isle.metrics; + +import java.util.HashMap; +import java.util.Map; + +import org.dllearner.algorithms.isle.index.SemanticIndex; +import org.semanticweb.owlapi.model.OWLEntity; + +/** + * @author Lorenz Buehmann + * + */ +public abstract class AbstractRelevanceMetric implements RelevanceMetric { + + protected SemanticIndex index; + + public AbstractRelevanceMetric(SemanticIndex index) { + this.index = index; + } + + public Map<OWLEntity,Double> normalizeMinMax( Map<OWLEntity,Double> hmEntity2Score ){ + Map<OWLEntity,Double> hmEntity2Norm = new HashMap<OWLEntity,Double>(); + double dMin = Double.MAX_VALUE; + Double dMax = Double.MIN_VALUE; + for( OWLEntity e : hmEntity2Score.keySet() ) + { + double dValue = hmEntity2Score.get(e); + if( dValue < dMin ){ + dMin = dValue; + } + else if( dValue > dMax ){ + dMax = dValue; + } + } + // System.out.println( "min="+ dMin +" max="+ dMax ); + for( OWLEntity e : hmEntity2Score.keySet() ) + { + double dValue = hmEntity2Score.get(e); + double dNorm = 0; + if( dMin == dMax ){ + dNorm = dValue; + } + else { + dNorm = ( dValue - dMin ) / ( dMax - dMin ); + } + hmEntity2Norm.put( e, dNorm ); + } + return hmEntity2Norm; + } + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,37 @@ +/** + * + */ +package org.dllearner.algorithms.isle.metrics; + +import java.util.Set; + +import org.dllearner.algorithms.isle.index.SemanticIndex; +import org.dllearner.core.owl.Entity; + +import com.google.common.collect.Sets; + +/** + * @author Lorenz Buehmann + * + */ +public class PMIRelevanceMetric extends AbstractRelevanceMetric { + + public PMIRelevanceMetric(SemanticIndex index) { + super(index); + } + + @Override + public double getRelevance(Entity entityA, Entity entityB){ + Set<String> documentsA = index.getDocuments(entityA); + Set<String> documentsB = index.getDocuments(entityB); + Set<String> documentsAB = Sets.intersection(documentsA, documentsB); + int nrOfDocuments = index.getSize(); + + double dPClass = nrOfDocuments == 0 ? 0 : ((double) documentsA.size() / (double) nrOfDocuments); + double dPClassEntity = documentsB.size() == 0 ? 
0 : (double) documentsAB.size() / (double) documentsB.size(); + double pmi = Math.log(dPClassEntity / dPClass); + + return pmi; + } + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceMetric.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceMetric.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,33 @@ +/** + * Copyright (C) 2007-2011, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + + +package org.dllearner.algorithms.isle.metrics; + +import org.dllearner.core.owl.Entity; + + +public interface RelevanceMetric { + /** + * @param entity1 + * @param entity2 + * @return + */ + double getRelevance(Entity entity1, Entity entity2); +} \ No newline at end of file Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceUtils.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceUtils.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/RelevanceUtils.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,50 @@ +/** + * + */ +package org.dllearner.algorithms.isle.metrics; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.dllearner.core.owl.Entity; +import org.dllearner.utilities.owl.OWLAPIConverter; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLOntology; + +/** + * @author Lorenz Buehmann + * + */ +public class RelevanceUtils { + + public static Map<Entity, Double> getRelevantEntities(Entity entity, Set<Entity> otherEntities, RelevanceMetric metric){ + Map<Entity, Double> relevantEntities = new HashMap<Entity, Double>(); + + for (Entity otherEntity : otherEntities) { + double relevance = metric.getRelevance(entity, otherEntity); + relevantEntities.put(otherEntity, relevance); + } + + return relevantEntities; + } + + public static Map<Entity, Double> getRelevantEntities(Entity entity, OWLOntology ontology, RelevanceMetric metric){ + Map<Entity, Double> relevantEntities = new HashMap<Entity, Double>(); + + Set<OWLEntity> owlEntities = new HashSet<OWLEntity>(); + owlEntities.addAll(ontology.getClassesInSignature()); + owlEntities.addAll(ontology.getDataPropertiesInSignature()); + owlEntities.addAll(ontology.getObjectPropertiesInSignature()); + Set<Entity> otherEntities = OWLAPIConverter.getEntities(owlEntities); + + for (Entity otherEntity : otherEntities) { + double relevance = metric.getRelevance(entity, otherEntity); + relevantEntities.put(otherEntity, relevance); + } + + return relevantEntities; + } + +} 
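[Editorial sketch] For orientation, a minimal example of how the metrics classes added above are meant to be combined. It assumes the caller supplies a working SemanticIndex implementation (SimpleSemanticIndex is still a stub at this revision, so its scores are not yet meaningful); the class name RelevanceScoring and its method are illustrative, not part of the commit:

    import java.util.Map;

    import org.dllearner.algorithms.isle.index.SemanticIndex;
    import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric;
    import org.dllearner.algorithms.isle.metrics.RelevanceMetric;
    import org.dllearner.algorithms.isle.metrics.RelevanceUtils;
    import org.dllearner.core.owl.Entity;
    import org.semanticweb.owlapi.model.OWLOntology;

    // Hypothetical driver class, not part of the commit.
    public class RelevanceScoring {

        // Scores every class and property in the ontology against the given entity
        // using the PMI metric over a semantic index.
        public static Map<Entity, Double> pmiScores(SemanticIndex index,
                OWLOntology ontology, Entity entity) {
            RelevanceMetric metric = new PMIRelevanceMetric(index);
            return RelevanceUtils.getRelevantEntities(entity, ontology, metric);
        }
    }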
Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-07-17 11:44:41 UTC (rev 4021) @@ -0,0 +1,93 @@ +/** + * + */ +package org.dllearner.algorithms.isle.textretrieval; + +import java.util.HashMap; +import java.util.Map; +import java.util.Set; + +import org.dllearner.core.owl.Entity; +import org.dllearner.kb.OWLAPIOntology; +import org.dllearner.utilities.owl.OWLAPIConverter; +import org.semanticweb.owlapi.model.IRI; +import org.semanticweb.owlapi.model.OWLAnnotation; +import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLLiteral; +import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.OWLOntologyManager; +import org.semanticweb.owlapi.util.IRIShortFormProvider; +import org.semanticweb.owlapi.util.SimpleIRIShortFormProvider; + + +/** + * @author Lorenz Buehmann + * + */ +public class AnnotationEntityTextRetriever implements EntityTextRetriever{ + + private OWLOntology ontology; + private OWLOntologyManager manager; + + private String language = "en"; + private double weight = 1d; + + private boolean useShortFormFallback = true; + private IRIShortFormProvider sfp = new SimpleIRIShortFormProvider(); + + private OWLAnnotationProperty[] properties; + + public AnnotationEntityTextRetriever(OWLOntology ontology, OWLAnnotationProperty... properties) { + this.ontology = ontology; + this.properties = properties; + } + + public AnnotationEntityTextRetriever(OWLAPIOntology ontology, OWLAnnotationProperty... properties) { + this.ontology = ontology.createOWLOntology(manager); + } + + /** + * @param language the language to set + */ + public void setLanguage(String language) { + this.language = language; + } + + /** + * Whether to use the short form of the IRI as fallback, if no label is given. 
+ * @param useShortFormFallback the useShortFormFallback to set + */ + public void setUseShortFormFallback(boolean useShortFormFallback) { + this.useShortFormFallback = useShortFormFallback; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.EntityTextRetriever#getRelevantText(org.dllearner.core.owl.Entity) + */ + @Override + public Map<String, Double> getRelevantText(Entity entity) { + Map<String, Double> textWithWeight = new HashMap<String, Double>(); + + OWLEntity e = OWLAPIConverter.getOWLAPIEntity(entity); + + for (OWLAnnotationProperty property : properties) { + Set<OWLAnnotation> annotations = e.getAnnotations(ontology, property); + for (OWLAnnotation annotation : annotations) { + if (annotation.getValue() instanceof OWLLiteral) { + OWLLiteral val = (OWLLiteral) annotation.getValue(); + if (val.hasLang(language)) { + String label = val.getLiteral(); + textWithWeight.put(label, weight); + } + } + } + } + + if(textWithWeight.isEmpty() && useShortFormFallback){ + textWithWeight.put(sfp.getShortForm(IRI.create(entity.getURI())), weight); + } + + return textWithWeight; + } +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTe... [truncated message content] |
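To make the retrieval contract concrete, here is a minimal sketch, assuming the RDFSLabelEntityTextRetriever added above simply instantiates AnnotationEntityTextRetriever with rdfs:label (its body is truncated in this message). getRelevantText returns each matching label with weight 1.0 and falls back to the IRI short form when no label in the configured language exists. Note also that of the two constructors shown, only the OWLOntology-based one assigns the annotation properties array.

    import java.util.Map;

    import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
    import org.dllearner.core.owl.Entity;
    import org.semanticweb.owlapi.model.OWLOntology;

    public class TextRetrievalExample {
        public static void printSurfaceForms(OWLOntology ontology, Entity entity) {
            RDFSLabelEntityTextRetriever retriever = new RDFSLabelEntityTextRetriever(ontology);
            Map<String, Double> textWithWeight = retriever.getRelevantText(entity);
            for (Map.Entry<String, Double> e : textWithWeight.entrySet()) {
                System.out.println(e.getKey() + " -> " + e.getValue()); // label -> 1.0
            }
        }
    }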
From: <lor...@us...> - 2013-09-03 15:44:29
|
Revision: 4036 http://sourceforge.net/p/dl-learner/code/4036 Author: lorenz_b Date: 2013-09-03 15:44:26 +0000 (Tue, 03 Sep 2013) Log Message: ----------- Added linguistic annotator. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java 2013-09-03 15:44:26 UTC (rev 4036) @@ -0,0 +1,25 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.util.Set; + +import org.dllearner.algorithms.isle.index.Annotation; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; + +/** + * @author Lorenz Buehmann + * + */ +public abstract class EntityCandidateGenerator { + + private OWLOntology ontology; + + public EntityCandidateGenerator(OWLOntology ontology) { + this.ontology = ontology; + } + + public abstract Set<Entity> getCandidates(Annotation annotation); +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java 2013-09-03 15:44:26 UTC (rev 4036) @@ -0,0 +1,16 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.util.Set; + +/** + * @author Lorenz Buehmann + * + */ +public interface LinguisticAnnotator { + + Set<Annotation> annotate(Document document); + +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-03 15:26:18 UTC (rev 4035) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-03 15:44:26 UTC (rev 4036) @@ -1,5 +1,11 @@ package org.dllearner.algorithms.isle.index; +import java.util.HashSet; +import java.util.Set; + +import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.WordSenseDisambiguation; +import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; /** @@ -9,15 +15,23 @@ */ public class SemanticAnnotator { - OWLOntology ontology; + private OWLOntology ontology; + private WordSenseDisambiguation wordSenseDisambiguation; + private EntityCandidateGenerator entityCandidateGenerator; + private LinguisticAnnotator linguisticAnnotator; + /** * Initialize this semantic annotator to use the entities from the provided ontology. 
* * @param ontology the ontology to use entities from */ - public SemanticAnnotator(OWLOntology ontology) { + public SemanticAnnotator(OWLOntology ontology, WordSenseDisambiguation wordSenseDisambiguation, + EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator) { this.ontology = ontology; + this.wordSenseDisambiguation = wordSenseDisambiguation; + this.entityCandidateGenerator = entityCandidateGenerator; + this.linguisticAnnotator = linguisticAnnotator; } /** @@ -26,7 +40,16 @@ * @param document the document to annotate * @return the given document extended with annotations */ - public AnnotatedDocument processDocument(Document document){ - return null; + public AnnotatedDocument processDocument(TextDocument document){ + Set<Annotation> annotations = linguisticAnnotator.annotate(document); + Set<SemanticAnnotation> semanticAnnotations = new HashSet<SemanticAnnotation>(); + for (Annotation annotation : annotations) { + Set<Entity> candidateEntities = entityCandidateGenerator.getCandidates(annotation); + SemanticAnnotation semanticAnnotation = wordSenseDisambiguation.disambiguate(annotation, candidateEntities); + semanticAnnotations.add(semanticAnnotation); + + } + AnnotatedDocument annotatedDocument = new AnnotatedTextDocument(document, semanticAnnotations); + return annotatedDocument; } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
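The pipeline introduced here is: a LinguisticAnnotator proposes annotation spans, an EntityCandidateGenerator maps each span to candidate entities, and a WordSenseDisambiguation picks one entity per span. A wiring sketch, borrowing the concrete implementations that appear in later revisions of this series (RandomWordSenseDisambiguation from r4039, SimpleEntityCandidateGenerator from r4040, SimpleLinguisticAnnotator from r4050) purely to illustrate the contract:

    import org.dllearner.algorithms.isle.RandomWordSenseDisambiguation;
    import org.dllearner.algorithms.isle.index.AnnotatedDocument;
    import org.dllearner.algorithms.isle.index.SemanticAnnotator;
    import org.dllearner.algorithms.isle.index.SimpleEntityCandidateGenerator;
    import org.dllearner.algorithms.isle.index.SimpleLinguisticAnnotator;
    import org.dllearner.algorithms.isle.index.TextDocument;
    import org.semanticweb.owlapi.model.OWLOntology;

    public class AnnotationPipelineExample {
        // annotate -> getCandidates per annotation -> disambiguate,
        // exactly the order processDocument() uses above.
        public static AnnotatedDocument annotate(OWLOntology ontology, TextDocument document) {
            SemanticAnnotator annotator = new SemanticAnnotator(
                    ontology,                                     // parameter removed again in r4040
                    new RandomWordSenseDisambiguation(ontology),  // baseline WSD (r4039)
                    new SimpleEntityCandidateGenerator(ontology), // candidate generator (r4040, body not shown here)
                    new SimpleLinguisticAnnotator());             // whitespace-based spans (r4050)
            return annotator.processDocument(document);
        }
    }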
From: <jen...@us...> - 2013-09-03 16:09:31
|
Revision: 4039 http://sourceforge.net/p/dl-learner/code/4039 Author: jenslehmann Date: 2013-09-03 16:09:27 +0000 (Tue, 03 Sep 2013) Log Message: ----------- implemented random WSD as baseline method Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RandomWordSenseDisambiguation.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RandomWordSenseDisambiguation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RandomWordSenseDisambiguation.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/RandomWordSenseDisambiguation.java 2013-09-03 16:09:27 UTC (rev 4039) @@ -0,0 +1,59 @@ +/** + * Copyright (C) 2007-2013, Jens Lehmann + * + * This file is part of DL-Learner. + * + * DL-Learner is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * DL-Learner is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +package org.dllearner.algorithms.isle; + +import java.util.Random; +import java.util.Set; + +import org.dllearner.algorithms.isle.index.Annotation; +import org.dllearner.algorithms.isle.index.SemanticAnnotation; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; + +/** + * Disambiguation by randomly selecting one of the candidates (baseline method). 
+ * + * @author Jens Lehmann + * + */ +public class RandomWordSenseDisambiguation extends WordSenseDisambiguation { + + private Random random; + + public RandomWordSenseDisambiguation(OWLOntology ontology) { + super(ontology); + random = new Random(); + } + + @Override + public SemanticAnnotation disambiguate(Annotation annotation, + Set<Entity> candidateEntities) { + int pos = random.nextInt(candidateEntities.size()); + int i = 0; + for(Entity e : candidateEntities) + { + if (i == pos) { + return new SemanticAnnotation(annotation, e); + } + i++; + } + return null; + } + +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java 2013-09-03 15:55:04 UTC (rev 4038) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java 2013-09-03 16:09:27 UTC (rev 4039) @@ -13,6 +13,11 @@ private Entity entity; + public SemanticAnnotation(Annotation annotation, Entity entity) { + super(annotation.getGetReferencedDocument(), annotation.getOffset(), annotation.getLength()); + this.entity = entity; + } + public SemanticAnnotation(Document getReferencedDocument, Entity entity, int offset, int length) { super(getReferencedDocument, offset, length); this.entity = entity; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
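One caveat with this baseline: Random.nextInt(0) throws IllegalArgumentException, so disambiguate must not be called with an empty candidate set. A small guarded wrapper (a sketch, not part of the commit):

    import java.util.Set;

    import org.dllearner.algorithms.isle.RandomWordSenseDisambiguation;
    import org.dllearner.algorithms.isle.index.Annotation;
    import org.dllearner.algorithms.isle.index.SemanticAnnotation;
    import org.dllearner.core.owl.Entity;

    public class SafeDisambiguation {
        // Returns null for empty candidate sets instead of letting
        // random.nextInt(candidateEntities.size()) throw.
        public static SemanticAnnotation disambiguateOrNull(RandomWordSenseDisambiguation wsd,
                Annotation annotation, Set<Entity> candidates) {
            return candidates.isEmpty() ? null : wsd.disambiguate(annotation, candidates);
        }
    }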
From: <lor...@us...> - 2013-09-03 16:48:19
|
Revision: 4040 http://sourceforge.net/p/dl-learner/code/4040 Author: lorenz_b Date: 2013-09-03 16:48:16 +0000 (Tue, 03 Sep 2013) Log Message: ----------- Added first implementation of semantic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndexFactory.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-03 16:09:27 UTC (rev 4039) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-03 16:48:16 UTC (rev 4040) @@ -6,7 +6,6 @@ import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.WordSenseDisambiguation; import org.dllearner.core.owl.Entity; -import org.semanticweb.owlapi.model.OWLOntology; /** * Provides methods to annotate documents. @@ -15,7 +14,6 @@ */ public class SemanticAnnotator { - private OWLOntology ontology; private WordSenseDisambiguation wordSenseDisambiguation; private EntityCandidateGenerator entityCandidateGenerator; private LinguisticAnnotator linguisticAnnotator; @@ -26,9 +24,8 @@ * * @param ontology the ontology to use entities from */ - public SemanticAnnotator(OWLOntology ontology, WordSenseDisambiguation wordSenseDisambiguation, + public SemanticAnnotator(WordSenseDisambiguation wordSenseDisambiguation, EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator) { - this.ontology = ontology; this.wordSenseDisambiguation = wordSenseDisambiguation; this.entityCandidateGenerator = entityCandidateGenerator; this.linguisticAnnotator = linguisticAnnotator; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-03 16:09:27 UTC (rev 4039) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-03 16:48:16 UTC (rev 4040) @@ -1,11 +1,19 @@ package org.dllearner.algorithms.isle.index.semantic; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.WordSenseDisambiguation; import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.Document; +import org.dllearner.algorithms.isle.index.LinguisticAnnotator; +import org.dllearner.algorithms.isle.index.SemanticAnnotator; +import org.dllearner.algorithms.isle.index.TextDocument; +import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; -import java.util.Set; - /** * Interface for an index which is able to resolve 
a given entity's URI to the set of documents containing * this entity, i.e., documents which contain words disambiguated to the given entity. @@ -13,14 +21,51 @@ * @author Lorenz Buehmann * @author Daniel Fleischhacker */ -public interface SemanticIndex { +public abstract class SemanticIndex { + + private SemanticAnnotator semanticAnnotator; + private SyntacticIndex syntacticIndex; + private Map<Entity, Set<AnnotatedDocument>> index; + private OWLOntology ontology; + + public SemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex, WordSenseDisambiguation wordSenseDisambiguation, + EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator) { + this.ontology = ontology; + this.syntacticIndex = syntacticIndex; + semanticAnnotator = new SemanticAnnotator(wordSenseDisambiguation, entityCandidateGenerator, linguisticAnnotator); + } + + public SemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex, SemanticAnnotator semanticAnnotator) { + this.semanticAnnotator = semanticAnnotator; + } + + /** + * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents. + */ + public void buildIndex(Set<TextDocument> documents){ + for (TextDocument document : documents) { + AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(document); + for (Entity entity : annotatedDocument.getContainedEntities()) { + Set<AnnotatedDocument> existingAnnotatedDocuments = index.get(entity); + if(existingAnnotatedDocuments == null){ + existingAnnotatedDocuments = new HashSet<AnnotatedDocument>(); + index.put(entity, existingAnnotatedDocuments); + } + existingAnnotatedDocuments.add(annotatedDocument); + } + } + } + /** * Returns the set of annotated documents which reference the given entity using one of its surface forms. * * @param entity entity to retrieve documents * @return documents referencing given entity */ - public Set<AnnotatedDocument> getDocuments(Entity entity); + public Set<AnnotatedDocument> getDocuments(Entity entity){ + Set<AnnotatedDocument> annotatedDocuments = index.get(entity); + return annotatedDocuments; + } /** * Returns the number of documents for the given entity. @@ -28,12 +73,16 @@ * @param entity entity to return number of referencing documents for * @return number of documents for the given entity in this index */ - public int count(Entity entity); + public int count(Entity entity){ + return index.get(entity).size(); + } /** * Returns the total number of documents contained in the index. 
* * @return the total number of documents contained in the index */ - public int getSize(); + public int getSize(){ + return index.size(); + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-03 16:09:27 UTC (rev 4039) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-03 16:48:16 UTC (rev 4040) @@ -3,28 +3,20 @@ */ package org.dllearner.algorithms.isle.index.semantic.simple; -import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.Document; +import org.dllearner.algorithms.isle.RandomWordSenseDisambiguation; +import org.dllearner.algorithms.isle.index.SimpleEntityCandidateGenerator; +import org.dllearner.algorithms.isle.index.SimpleLinguisticAnnotator; +import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; -import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; -import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; -import java.util.HashSet; -import java.util.Map; -import java.util.Map.Entry; -import java.util.Set; - /** * A semantic index which returns all documents which contain at least one of the labels assigned to a specific * entity in a provided ontology. * * @author Lorenz Buehmann */ -public class SimpleSemanticIndex implements SemanticIndex { - private SyntacticIndex syntacticIndex; - private RDFSLabelEntityTextRetriever labelRetriever; +public class SimpleSemanticIndex extends SemanticIndex { /** * Initializes the semantic index to use {@code ontology} for finding all labels of an entity and @@ -34,41 +26,11 @@ * @param syntacticIndex index to query for documents containing the labels */ public SimpleSemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex) { - this.syntacticIndex = syntacticIndex; - labelRetriever = new RDFSLabelEntityTextRetriever(ontology); + super(ontology, + syntacticIndex, + new RandomWordSenseDisambiguation(ontology), + new SimpleEntityCandidateGenerator(ontology), + new SimpleLinguisticAnnotator()); } - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SemanticIndex#getDocuments(org.dllearner.core.owl.Entity) - */ - @Override - public Set<AnnotatedDocument> getDocuments(Entity entity) { - Set<AnnotatedDocument> documents = new HashSet<AnnotatedDocument>(); - Map<String, Double> relevantText = labelRetriever.getRelevantText(entity); - - for (Entry<String, Double> entry : relevantText.entrySet()) { - String label = entry.getKey(); - documents.addAll(syntacticIndex.getDocuments(label)); - } - - return documents; - } - - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SemanticIndex#count(java.lang.String) - */ - @Override - public int count(Entity entity) { - return getDocuments(entity).size(); - } - - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SemanticIndex#getSize() - */ - @Override - public int getSize() { - return syntacticIndex.getSize(); - } - - } Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndexFactory.java 
=================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndexFactory.java 2013-09-03 16:09:27 UTC (rev 4039) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndexFactory.java 2013-09-03 16:48:16 UTC (rev 4040) @@ -1,40 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle.index.semantic.simple; - -import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndexFactory; -import org.semanticweb.owlapi.model.OWLOntology; - -import java.io.File; - -/** - * This gets a syntactic index and returns a semantic index by applying WSD etc. - * - * @author Lorenz Buehmann - * @author Daniel Fleischhacker - */ -public class SimpleSemanticIndexFactory implements SemanticIndexFactory { - private OWLOntology ontology; - private SyntacticIndex syntacticIndex; - - /** - * Initializes a semantic index factory for creating simple semantic indexes. Simple semantic indexes use - * the labels assigned to an entity in {@code ontology} as its surface forms and return the all documents - * from the given syntactic index which contain at least one of these surface forms. - * - * @param syntacticIndex the syntactic index in which occurrences of the labels are searched - * @param ontology the ontology retrieve the entities' labels from - */ - public SimpleSemanticIndexFactory(SyntacticIndex syntacticIndex, OWLOntology ontology) { - this.syntacticIndex = syntacticIndex; - this.ontology = ontology; - } - - @Override - public SemanticIndex createIndex(File inputDirectory) { - return new SimpleSemanticIndex(ontology, syntacticIndex); - } -} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-09-03 16:09:27 UTC (rev 4039) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-09-03 16:48:16 UTC (rev 4040) @@ -3,12 +3,13 @@ */ package org.dllearner.algorithms.isle.metrics; -import com.google.common.collect.Sets; -import org.dllearner.algorithms.isle.index.Document; +import java.util.Set; + +import org.dllearner.algorithms.isle.index.AnnotatedDocument; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.core.owl.Entity; -import java.util.Set; +import com.google.common.collect.Sets; /** * @author Lorenz Buehmann @@ -22,9 +23,9 @@ @Override public double getRelevance(Entity entityA, Entity entityB){ - Set<Document> documentsA = index.getDocuments(entityA); - Set<Document> documentsB = index.getDocuments(entityB); - Set<Document> documentsAB = Sets.intersection(documentsA, documentsB); + Set<AnnotatedDocument> documentsA = index.getDocuments(entityA); + Set<AnnotatedDocument> documentsB = index.getDocuments(entityB); + Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB); int nrOfDocuments = index.getSize(); double dPClass = nrOfDocuments == 0 ? 0 : ((double) documentsA.size() / (double) nrOfDocuments); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
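End-to-end, the refactored index is now used as follows: build it once over a corpus, then hand it to a relevance metric. A sketch, with two assumptions flagged: the PMIRelevanceMetric(SemanticIndex) constructor is not shown in this diff, and the index map in SemanticIndex is declared but never initialized in this revision, so buildIndex() relies on an initialization (e.g. a HashMap) being added.

    import java.util.Set;

    import org.dllearner.algorithms.isle.index.TextDocument;
    import org.dllearner.algorithms.isle.index.semantic.SemanticIndex;
    import org.dllearner.algorithms.isle.index.semantic.simple.SimpleSemanticIndex;
    import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex;
    import org.dllearner.algorithms.isle.metrics.PMIRelevanceMetric;
    import org.dllearner.core.owl.Entity;
    import org.semanticweb.owlapi.model.OWLOntology;

    public class SemanticIndexExample {
        public static double pmi(OWLOntology ontology, SyntacticIndex syntacticIndex,
                Set<TextDocument> corpus, Entity a, Entity b) {
            SemanticIndex index = new SimpleSemanticIndex(ontology, syntacticIndex);
            index.buildIndex(corpus); // annotates the corpus and groups documents by entity
            return new PMIRelevanceMetric(index).getRelevance(a, b);
        }
    }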
From: <lor...@us...> - 2013-09-04 09:41:46
|
Revision: 4050 http://sourceforge.net/p/dl-learner/code/4050 Author: lorenz_b Date: 2013-09-04 09:41:41 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Added simple stop word filtering. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StopWordFilter.java 2013-09-04 09:41:41 UTC (rev 4050) @@ -0,0 +1,56 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.io.File; +import java.io.IOException; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import org.dllearner.algorithms.isle.index.Annotation; + +import com.google.common.base.Charsets; +import com.google.common.io.Files; + +/** + * @author Lorenz Buehmann + * + */ +public class StopWordFilter { + + private Set<String> stopWords; + private static final String stopWordfile = "src/main/resources/stopwords.txt"; + + public StopWordFilter() { + try { + stopWords = new HashSet<String>(Files.readLines(new File(stopWordfile), Charsets.UTF_8)); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public String removeStopWords(String input) { + for (String s : stopWords) { + input = input.replaceAll("\\b" + s + "\\b", ""); + } + return input; + } + + public void removeStopWords(Set<String> words) { + words.removeAll(stopWords); + } + + public void removeStopWordAnnotations(Set<Annotation> annotations) { + for (Iterator<Annotation> iter = annotations.iterator(); iter.hasNext();) { + Annotation annotation = iter.next(); + String content = annotation.getGetReferencedDocument().getContent(); + String token = content.substring(annotation.getOffset(), annotation.getOffset()+annotation.getLength()); + if(stopWords.contains(token)){ + iter.remove(); + } + } + } + +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 09:41:35 UTC (rev 4049) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 09:41:41 UTC (rev 4050) @@ -1,22 +1,35 @@ package org.dllearner.algorithms.isle.index; +import java.io.IOException; +import java.io.StringReader; import java.util.HashSet; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.util.Version; +import org.dllearner.algorithms.isle.StopWordFilter; + /** * * @author Jens Lehmann * */ public class SimpleLinguisticAnnotator implements LinguisticAnnotator { + + private StopWordFilter stopWordFilter = new StopWordFilter(); @Override public Set<Annotation> annotate(Document document) { String s = document.getRawContent().trim(); + System.out.println("Document:" + s); +// s = 
stopWordFilter.removeStopWords(s); Set<Annotation> annotations = new HashSet<Annotation>(); - Pattern pattern = Pattern.compile("\\u0020+"); + Pattern pattern = Pattern.compile("(\\u0020)+"); Matcher matcher = pattern.matcher(s); // Check all occurrences int start = 0; @@ -28,7 +41,20 @@ if(start < s.length()-1){ annotations.add(new Annotation(document, start, s.length() - start)); } + stopWordFilter.removeStopWordAnnotations(annotations); return annotations; } + + public static void main(String[] args) throws Exception { + String s = "male person least 1 child"; + Pattern pattern = Pattern.compile("(\\u0020)+"); + Matcher matcher = pattern.matcher(s); + int start = 0; + while (matcher.find()) { + int end = matcher.start(); + System.out.println(s.substring(start, end)); + start = matcher.end(); + } + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
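Since removeStopWords(String) splices each stop word directly into a regular expression, a word containing a regex metacharacter would corrupt the pattern. A hardened sketch (a hypothetical helper, not part of the commit) quotes each word first:

    import java.util.Set;
    import java.util.regex.Pattern;

    public class StopWordUtil {
        // Same whole-word deletion as StopWordFilter.removeStopWords(String),
        // but Pattern.quote() keeps metacharacters from breaking the regex.
        public static String removeStopWords(String input, Set<String> stopWords) {
            for (String s : stopWords) {
                input = input.replaceAll("\\b" + Pattern.quote(s) + "\\b", "");
            }
            return input;
        }
    }

A similar caution applies to loading src/main/resources/stopwords.txt through a relative File path, which only resolves when the JVM is started from the module root; reading the file from the classpath would be more robust.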
From: <dfl...@us...> - 2013-09-04 14:28:16
|
Revision: 4061 http://sourceforge.net/p/dl-learner/code/4061 Author: dfleischhacker Date: 2013-09-04 14:28:12 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Add class providing wrapper for common linguistic operations Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-09-04 14:26:47 UTC (rev 4060) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-09-04 14:28:12 UTC (rev 4061) @@ -4,6 +4,7 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.Set; import net.didion.jwnl.JWNL; import net.didion.jwnl.JWNLException; @@ -71,8 +72,31 @@ } return synonyms; } - - public List<String> getSisterTerms(POS pos, String s){ + + public List<String> getAllSynonyms(POS pos, String s) { + List<String> synonyms = new ArrayList<String>(); + try { + IndexWord iw = dict.getIndexWord(pos, s); + if (iw != null) { + Synset[] synsets = iw.getSenses(); + for (Synset synset : synsets) { + for (Word w : synset.getWords()) { + String lemma = w.getLemma(); + if (!lemma.equals(s) && !lemma.contains(" ")) { + synonyms.add(lemma); + } + } + } + } + } + catch (JWNLException e) { + e.printStackTrace(); + } + + return synonyms; + } + + public List<String> getSisterTerms(POS pos, String s){ List<String> sisterTerms = new ArrayList<String>(); try { Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-04 14:28:12 UTC (rev 4061) @@ -0,0 +1,78 @@ +package org.dllearner.algorithms.isle.index; + +import net.didion.jwnl.data.POS; +import org.dllearner.algorithms.isle.WordNet; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Provides shortcuts to + * @author Daniel Fleischhacker + */ +public class LinguisticUtil { + private static final WordNet wn = new WordNet(); + private static POS[] RELEVANT_POS = new POS[]{POS.NOUN, POS.VERB}; + + /** + * Processes the given string and puts camelCased words into single words. 
+ * @param camelCase the word containing camelcase to split + * @return all words as camelcase contained in the given word + */ + public static String[] getWordsFromCamelCase(String camelCase) { + ArrayList<String> resultingWords = new ArrayList<String>(); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < camelCase.length(); i++) { + // we just ignore characters not matching the defined pattern + char curChar = camelCase.charAt(i); + if (!Character.isLetter(curChar)) { + continue; + } + if (Character.isUpperCase(curChar)) { // found a new upper case letter + resultingWords.add(sb.toString()); + sb = new StringBuilder(); + sb.append(Character.toLowerCase(curChar)); + } + else { // lower case letter + sb.append(curChar); + } + } + + if (sb.length() > 0) { + resultingWords.add(sb.toString()); + } + + return resultingWords.toArray(new String[resultingWords.size()]); + } + + /** + * Split word into words it contains divided by underscores. + * + * @param underScored word to split at underscores + * @return words contained in given word + */ + public static String[] getWordsFromUnderscored(String underScored) { + return underScored.split("_"); + } + + // get synonyms + public static String[] getSynonymsForWord(String word) { + ArrayList<String> synonyms = new ArrayList<String>(); + + for (POS pos : RELEVANT_POS) { + synonyms.addAll(wn.getAllSynonyms(pos, word)); + } + return synonyms.toArray(new String[synonyms.size()]); + } + + public static void main(String[] args) { + for (String s : getWordsFromCamelCase("thisIsAClassWith1Name123")) { + System.out.println(s); + for (String w : getSynonymsForWord(s)) { + System.out.println(" --> " + w); + } + } + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
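The camel-case splitter walks the string and flushes its buffer at every upper-case letter, dropping non-letter characters entirely; note that a leading upper-case letter therefore flushes a still-empty buffer and produces an empty first token. Expected behaviour, matching the main() method above:

    import org.dllearner.algorithms.isle.index.LinguisticUtil;

    public class SplitExample {
        public static void main(String[] args) {
            // prints: this, is, a, class, with, name (digits are dropped)
            for (String w : LinguisticUtil.getWordsFromCamelCase("thisIsAClassWith1Name123")) {
                System.out.println(w);
            }
            // underscore splitting is a plain String.split("_"): has, child
            for (String w : LinguisticUtil.getWordsFromUnderscored("has_child")) {
                System.out.println(w);
            }
        }
    }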
From: <lor...@us...> - 2013-09-05 09:10:36
|
Revision: 4087 http://sourceforge.net/p/dl-learner/code/4087 Author: lorenz_b Date: 2013-09-05 09:10:33 +0000 (Thu, 05 Sep 2013) Log Message: ----------- Added printing. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/ISLE.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/ISLE.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/ISLE.java 2013-09-05 08:52:49 UTC (rev 4086) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/ISLE.java 2013-09-05 09:10:33 UTC (rev 4087) @@ -81,7 +81,7 @@ @ComponentAnn(name="ISLE", shortName="isle", version=0.5, description="CELOE is an adapted and extended version of the OCEL algorithm applied for the ontology engineering use case. See http://jens-lehmann.org/files/2011/celoe.pdf for reference.") public class ISLE extends AbstractCELA { - private static Logger logger = Logger.getLogger(CELOE.class); + private static Logger logger = Logger.getLogger(ISLE.class); // private CELOEConfigurator configurator; private boolean isRunning = false; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java 2013-09-05 08:52:49 UTC (rev 4086) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java 2013-09-05 09:10:33 UTC (rev 4087) @@ -107,19 +107,19 @@ score -= node.getRefinementCount() * nodeRefinementPenalty; - //the NLP based scoring -// Description expression = node.getExpression(); -//// OWLClassExpression owlapiDescription = OWLAPIConverter.getOWLAPIDescription(expression); -//// Set<Entity> entities = OWLAPIConverter.getEntities(owlapiDescription.getSignature()); -// Set<Entity> entities = expression.getSignature(); -// double sum = 0; -// for (Entity entity : entities) { -// double relevance = entityRelevance.containsKey(entity) ? entityRelevance.get(entity) : 0; -// if(!Double.isInfinite(relevance)){ -// sum += relevance; -// } -// } -// score += nlpBonusFactor * sum; +// the NLP based scoring + Description expression = node.getExpression();System.out.println(expression); +// OWLClassExpression owlapiDescription = OWLAPIConverter.getOWLAPIDescription(expression); +// Set<Entity> entities = OWLAPIConverter.getEntities(owlapiDescription.getSignature()); + Set<Entity> entities = expression.getSignature(); + double sum = 0; + for (Entity entity : entities) { + double relevance = entityRelevance.containsKey(entity) ? 
entityRelevance.get(entity) : 0;System.out.println(entity + ":" + relevance); + if(!Double.isInfinite(relevance)){ + sum += relevance; + } + } + score += nlpBonusFactor * sum; return score; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-05 08:52:49 UTC (rev 4086) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-05 09:10:33 UTC (rev 4087) @@ -75,6 +75,7 @@ } existingAnnotatedDocuments.add(annotatedDocument); } + logger.info("Annotated document:" + annotatedDocument); } logger.info("...done."); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-05 08:52:49 UTC (rev 4086) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-05 09:10:33 UTC (rev 4087) @@ -31,6 +31,7 @@ public SimpleSemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex) { super(ontology); SimpleEntityCandidatesTrie trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology); + trie.printTrie(); setSemanticAnnotator(new SemanticAnnotator( new SimpleWordSenseDisambiguation(ontology), new TrieEntityCandidateGenerator(ontology, trie), Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-09-05 08:52:49 UTC (rev 4086) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-09-05 09:10:33 UTC (rev 4087) @@ -82,7 +82,7 @@ if (annotation.getValue() instanceof OWLLiteral) { OWLLiteral val = (OWLLiteral) annotation.getValue(); if (val.hasLang(language)) { - String label = val.getLiteral(); + String label = val.getLiteral().trim(); textWithWeight.put(label, weight); } } @@ -92,7 +92,7 @@ if(textWithWeight.isEmpty() && useShortFormFallback){ String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); shortForm = Joiner.on(" ").join(LinguisticUtil.getWordsFromCamelCase(shortForm)); - shortForm = Joiner.on(" ").join(LinguisticUtil.getWordsFromUnderscored(shortForm)); + shortForm = Joiner.on(" ").join(LinguisticUtil.getWordsFromUnderscored(shortForm)).trim(); textWithWeight.put(shortForm, weight); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
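For the NLP bonus re-enabled above, the contribution per search-tree node is nlpBonusFactor times the sum of the finite relevance values of the entities in the node's signature; entities without a recorded relevance count as 0. A tiny worked example with hypothetical entity names and relevance values:

    import java.util.HashMap;
    import java.util.Map;

    public class NlpBonusExample {
        public static void main(String[] args) {
            Map<String, Double> entityRelevance = new HashMap<String, Double>();
            entityRelevance.put("Person", 0.9);   // hypothetical PMI-based relevances
            entityRelevance.put("hasChild", 0.4);
            double nlpBonusFactor = 0.1;
            double sum = 0;
            for (double relevance : entityRelevance.values()) {
                if (!Double.isInfinite(relevance)) { // same guard as NLPHeuristic
                    sum += relevance;
                }
            }
            System.out.println(nlpBonusFactor * sum); // ~0.13 added to the node score
        }
    }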
From: <dfl...@us...> - 2013-09-06 10:01:56
|
Revision: 4092 http://sourceforge.net/p/dl-learner/code/4092 Author: dfleischhacker Date: 2013-09-06 10:01:53 +0000 (Fri, 06 Sep 2013) Log Message: ----------- Add methods to get top n synonyms for words Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-09-05 13:59:47 UTC (rev 4091) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-09-06 10:01:53 UTC (rev 4092) @@ -1,78 +1,115 @@ package org.dllearner.algorithms.isle; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Set; - import net.didion.jwnl.JWNL; import net.didion.jwnl.JWNLException; -import net.didion.jwnl.data.IndexWord; -import net.didion.jwnl.data.POS; -import net.didion.jwnl.data.PointerTarget; -import net.didion.jwnl.data.PointerUtils; -import net.didion.jwnl.data.Synset; -import net.didion.jwnl.data.Word; +import net.didion.jwnl.data.*; import net.didion.jwnl.data.list.PointerTargetNode; import net.didion.jwnl.data.list.PointerTargetNodeList; import net.didion.jwnl.dictionary.Dictionary; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + public class WordNet { - - public Dictionary dict; - - public WordNet() { - try { - JWNL.initialize(this.getClass().getClassLoader().getResourceAsStream("wordnet_properties.xml")); - dict = Dictionary.getInstance(); - } catch (JWNLException e) { - e.printStackTrace(); - } - } - - public WordNet(String configPath) { - try { - JWNL.initialize(this.getClass().getClassLoader().getResourceAsStream(configPath)); - dict = Dictionary.getInstance(); - } catch (JWNLException e) { - e.printStackTrace(); - } - } - - public WordNet(InputStream propertiesStream) { - try { - JWNL.initialize(propertiesStream); - dict = Dictionary.getInstance(); - } catch (JWNLException e) { - e.printStackTrace(); - } - } - - public List<String> getBestSynonyms(POS pos, String s) { - - List<String> synonyms = new ArrayList<String>(); - - try { - IndexWord iw = dict.getIndexWord(pos, s);//dict.getMorphologicalProcessor().lookupBaseForm(pos, s) + + public Dictionary dict; + + public WordNet() { + try { + JWNL.initialize(this.getClass().getClassLoader().getResourceAsStream("wordnet_properties.xml")); + dict = Dictionary.getInstance(); + } + catch (JWNLException e) { + e.printStackTrace(); + } + } + + public WordNet(String configPath) { + try { + JWNL.initialize(this.getClass().getClassLoader().getResourceAsStream(configPath)); + dict = Dictionary.getInstance(); + } + catch (JWNLException e) { + e.printStackTrace(); + } + } + + public WordNet(InputStream propertiesStream) { + try { + JWNL.initialize(propertiesStream); + dict = Dictionary.getInstance(); + } + catch (JWNLException e) { + e.printStackTrace(); + } + } + + public static void main(String[] args) { + System.out.println(new WordNet().getBestSynonyms(POS.VERB, "learn")); + System.out.println(new WordNet().getSisterTerms(POS.NOUN, "actress")); + } + + public List<String> getBestSynonyms(POS pos, String s) { + + List<String> synonyms = new ArrayList<String>(); + + try { + IndexWord iw = dict.getIndexWord(pos, 
s);//dict.getMorphologicalProcessor().lookupBaseForm(pos, s) // IndexWord iw = dict.getMorphologicalProcessor().lookupBaseForm(pos, s); - if(iw != null){ - Synset[] synsets = iw.getSenses(); - Word[] words = synsets[0].getWords(); - for(Word w : words){ - String c = w.getLemma(); - if (!c.equals(s) && !c.contains(" ") && synonyms.size() < 4) { - synonyms.add(c); - } - } - } - - } catch (JWNLException e) { - e.printStackTrace(); - } - return synonyms; - } + if (iw != null) { + Synset[] synsets = iw.getSenses(); + Word[] words = synsets[0].getWords(); + for (Word w : words) { + String c = w.getLemma(); + if (!c.equals(s) && !c.contains(" ") && synonyms.size() < 4) { + synonyms.add(c); + } + } + } + } + catch (JWNLException e) { + e.printStackTrace(); + } + return synonyms; + } + + /** + * Returns the lemmas for the top {@code n} synsets of the given POS for the string {@code s}. + * + * @param pos the part of speech to retrieve synonyms for + * @param s the string to retrieve synonyms for + * @param n the number of synonyms to retrieve + * @return list of the lemmas of the top n synonyms of s + */ + public List<String> getTopSynonyms(POS pos, String s, int n) { + + List<String> synonyms = new ArrayList<String>(); + + try { + IndexWord iw = dict.getIndexWord(pos, s);//dict.getMorphologicalProcessor().lookupBaseForm(pos, s) +// IndexWord iw = dict.getMorphologicalProcessor().lookupBaseForm(pos, s); + if (iw != null) { + Synset[] synsets = iw.getSenses(); + for (int i = 0; i < n; i++) { + for (Word word : synsets[i].getWords()) { + String c = word.getLemma(); + if (!c.equals(s) && !c.contains(" ")) { + synonyms.add(c); + } + } + } + } + + } + catch (JWNLException e) { + e.printStackTrace(); + } + return synonyms; + } + public List<String> getAllSynonyms(POS pos, String s) { List<String> synonyms = new ArrayList<String>(); try { @@ -96,120 +133,124 @@ return synonyms; } - public List<String> getSisterTerms(POS pos, String s){ - List<String> sisterTerms = new ArrayList<String>(); - - try { - IndexWord iw = dict.getIndexWord(pos, s);//dict.getMorphologicalProcessor().lookupBaseForm(pos, s) + public List<String> getSisterTerms(POS pos, String s) { + List<String> sisterTerms = new ArrayList<String>(); + + try { + IndexWord iw = dict.getIndexWord(pos, s);//dict.getMorphologicalProcessor().lookupBaseForm(pos, s) // IndexWord iw = dict.getMorphologicalProcessor().lookupBaseForm(pos, s); - if(iw != null){ - Synset[] synsets = iw.getSenses(); - //System.out.println(synsets[0]); - PointerTarget[] pointerArr = synsets[0].getTargets(); - } - - } catch (JWNLException e) { - e.printStackTrace(); - } - return sisterTerms; - } - - public List<String> getAttributes(String s) { - - List<String> result = new ArrayList<String>(); - - try { - IndexWord iw = dict.getIndexWord(POS.ADJECTIVE, s); - if(iw != null){ - Synset[] synsets = iw.getSenses(); - Word[] words = synsets[0].getWords(); - for(Word w : words){ - String c = w.getLemma(); - if (!c.equals(s) && !c.contains(" ") && result.size() < 4) { - result.add(c); - } - } - } - - } catch (JWNLException e) { - e.printStackTrace(); - } - - return result; - } - - public static void main(String[] args) { - System.out.println(new WordNet().getBestSynonyms(POS.VERB, "learn")); - System.out.println(new WordNet().getSisterTerms(POS.NOUN, "actress")); - } - - /** - * Funktion returns a List of Hypo and Hypernyms of a given string - * @param s Word for which you want to get Hypo and Hypersyms - * @return List of Hypo and Hypernyms - * @throws JWNLException - */ - public 
List<String> getRelatedNouns(String s) { - List<String> result = new ArrayList<String>(); - IndexWord word = null; - Synset sense=null; - try{ - word=dict.getIndexWord(POS.NOUN,s); - if(word!=null){ - sense = word.getSense(1); - //Synset sense = word.getSense(1); - - PointerTargetNodeList relatedListHypernyms = null; - PointerTargetNodeList relatedListHyponyms = null; - try { - relatedListHypernyms = PointerUtils.getInstance().getDirectHypernyms(sense); - } catch (JWNLException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - try { - relatedListHyponyms = PointerUtils.getInstance().getDirectHyponyms(sense); - } catch (JWNLException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - - Iterator i = relatedListHypernyms.iterator(); - while (i.hasNext()) { - PointerTargetNode related = (PointerTargetNode) i.next(); - Synset s1 = related.getSynset(); - String tmp=(s1.toString()).replace(s1.getGloss(), ""); - tmp=tmp.replace(" -- ()]",""); - tmp=tmp.replaceAll("[0-9]",""); - tmp=tmp.replace("[Synset: [Offset: ",""); - tmp=tmp.replace("] [POS: noun] Words: ",""); - //its possible, that there is more than one word in a line from wordnet - String[] array_tmp=tmp.split(","); - for(String z : array_tmp) result.add(z.replace(" ", "")); - } - - Iterator j = relatedListHyponyms.iterator(); - while (j.hasNext()) { - PointerTargetNode related = (PointerTargetNode) j.next(); - Synset s1 = related.getSynset(); - String tmp=(s1.toString()).replace(s1.getGloss(), ""); - tmp=tmp.replace(" -- ()]",""); - tmp=tmp.replaceAll("[0-9]",""); - tmp=tmp.replace("[Synset: [Offset: ",""); - tmp=tmp.replace("] [POS: noun] Words: ",""); - //its possible, that there is more than one word in a line from wordnet - String[] array_tmp=tmp.split(","); - for(String z : array_tmp) result.add(z.replace(" ", "")); - } - } - }catch (JWNLException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - - - - return result; - } - + if (iw != null) { + Synset[] synsets = iw.getSenses(); + //System.out.println(synsets[0]); + PointerTarget[] pointerArr = synsets[0].getTargets(); + } + + } + catch (JWNLException e) { + e.printStackTrace(); + } + return sisterTerms; + } + + public List<String> getAttributes(String s) { + + List<String> result = new ArrayList<String>(); + + try { + IndexWord iw = dict.getIndexWord(POS.ADJECTIVE, s); + if (iw != null) { + Synset[] synsets = iw.getSenses(); + Word[] words = synsets[0].getWords(); + for (Word w : words) { + String c = w.getLemma(); + if (!c.equals(s) && !c.contains(" ") && result.size() < 4) { + result.add(c); + } + } + } + + } + catch (JWNLException e) { + e.printStackTrace(); + } + + return result; + } + + /** + * Funktion returns a List of Hypo and Hypernyms of a given string + * + * @param s Word for which you want to get Hypo and Hypersyms + * @return List of Hypo and Hypernyms + * @throws JWNLException + */ + public List<String> getRelatedNouns(String s) { + List<String> result = new ArrayList<String>(); + IndexWord word = null; + Synset sense = null; + try { + word = dict.getIndexWord(POS.NOUN, s); + if (word != null) { + sense = word.getSense(1); + //Synset sense = word.getSense(1); + + PointerTargetNodeList relatedListHypernyms = null; + PointerTargetNodeList relatedListHyponyms = null; + try { + relatedListHypernyms = PointerUtils.getInstance().getDirectHypernyms(sense); + } + catch (JWNLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + try { + relatedListHyponyms = 
PointerUtils.getInstance().getDirectHyponyms(sense); + } + catch (JWNLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + Iterator i = relatedListHypernyms.iterator(); + while (i.hasNext()) { + PointerTargetNode related = (PointerTargetNode) i.next(); + Synset s1 = related.getSynset(); + String tmp = (s1.toString()).replace(s1.getGloss(), ""); + tmp = tmp.replace(" -- ()]", ""); + tmp = tmp.replaceAll("[0-9]", ""); + tmp = tmp.replace("[Synset: [Offset: ", ""); + tmp = tmp.replace("] [POS: noun] Words: ", ""); + //its possible, that there is more than one word in a line from wordnet + String[] array_tmp = tmp.split(","); + for (String z : array_tmp) { + result.add(z.replace(" ", "")); + } + } + + Iterator j = relatedListHyponyms.iterator(); + while (j.hasNext()) { + PointerTargetNode related = (PointerTargetNode) j.next(); + Synset s1 = related.getSynset(); + String tmp = (s1.toString()).replace(s1.getGloss(), ""); + tmp = tmp.replace(" -- ()]", ""); + tmp = tmp.replaceAll("[0-9]", ""); + tmp = tmp.replace("[Synset: [Offset: ", ""); + tmp = tmp.replace("] [POS: noun] Words: ", ""); + //its possible, that there is more than one word in a line from wordnet + String[] array_tmp = tmp.split(","); + for (String z : array_tmp) { + result.add(z.replace(" ", "")); + } + } + } + } + catch (JWNLException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + + return result; + } + } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-05 13:59:47 UTC (rev 4091) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-06 10:01:53 UTC (rev 4092) @@ -83,6 +83,23 @@ } /** + * Returns an array of the lemmas of the top {@code n} synonyms for the given word. Only synonyms for the POS in + * {@link #RELEVANT_POS} are returned. + * + * @param word the word to retrieve synonyms for + * @param n the number of senses to get lemmas for + * @return synonyms for the given word + */ + public static String[] getTopSynonymsForWord(String word, int n) { + ArrayList<String> synonyms = new ArrayList<String>(); + + for (POS pos : RELEVANT_POS) { + synonyms.addAll(wn.getTopSynonyms(pos, word, n)); + } + return synonyms.toArray(new String[synonyms.size()]); + } + + /** * Returns the normalized form of the given word. This method is only able to work with single words! If there is an * error normalizing the given word, the word itself is returned. * This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
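A quick sketch of the new top-n lookup (actual output depends on the local WordNet installation configured via wordnet_properties.xml). Note that the follow-up revision 4093 below guards against words that have fewer than n senses:

    import org.dllearner.algorithms.isle.index.LinguisticUtil;

    public class SynonymExample {
        public static void main(String[] args) {
            // Lemmas from the top 3 senses for each relevant POS (noun, verb);
            // multi-word lemmas and the query word itself are filtered out.
            for (String synonym : LinguisticUtil.getTopSynonymsForWord("learn", 3)) {
                System.out.println(synonym);
            }
        }
    }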
From: <dfl...@us...> - 2013-09-06 11:36:39
|
Revision: 4093 http://sourceforge.net/p/dl-learner/code/4093 Author: dfleischhacker Date: 2013-09-06 11:36:33 +0000 (Fri, 06 Sep 2013) Log Message: ----------- Extend ontology words by synonyms Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-09-06 10:01:53 UTC (rev 4092) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-09-06 11:36:33 UTC (rev 4093) @@ -93,7 +93,7 @@ // IndexWord iw = dict.getMorphologicalProcessor().lookupBaseForm(pos, s); if (iw != null) { Synset[] synsets = iw.getSenses(); - for (int i = 0; i < n; i++) { + for (int i = 0; i < Math.min(n, synsets.length); i++) { for (Word word : synsets[i].getWords()) { String c = word.getLemma(); if (!c.equals(s) && !c.contains(" ")) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-06 10:01:53 UTC (rev 4092) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-06 11:36:33 UTC (rev 4093) @@ -36,9 +36,13 @@ for (int i = 0; i < camelCase.length(); i++) { // we just ignore characters not matching the defined pattern char curChar = camelCase.charAt(i); - if (!Character.isLetter(curChar)) { + if (Character.isWhitespace(curChar)) { + sb.append(" "); continue; } + else if (!Character.isLetter(curChar)) { + continue; + } if (Character.isUpperCase(curChar)) { // found a new upper case letter resultingWords.add(sb.toString()); sb = new StringBuilder(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-09-06 10:01:53 UTC (rev 4092) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-09-06 11:36:33 UTC (rev 4093) @@ -1,5 +1,6 @@ package org.dllearner.algorithms.isle.index; +import org.apache.commons.lang.StringUtils; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; import org.dllearner.utilities.datastructures.PrefixTrie; @@ -11,28 +12,62 @@ PrefixTrie<Set<Entity>> trie; EntityTextRetriever entityTextRetriever; - + + /** + * Initialize the trie with strings from the provided ontology using a no-op name generator, i.e., only the + * actual ontology strings are added and no expansion is done. 
+ * + * @param entityTextRetriever the text retriever to use + * @param ontology the ontology to get strings from + */ public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { - this.entityTextRetriever = entityTextRetriever; - buildTrie(ontology); + this(entityTextRetriever, ontology, new DummyNameGenerator()); } + + /** + * Initialize the trie with strings from the provided ontology and use the given entity name generator + * for generating alternative words. + * + * @param entityTextRetriever the text retriever to use + * @param ontology the ontology to get strings from + * @param nameGenerator the name generator to use for generating alternative words + */ + public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology, + NameGenerator nameGenerator) { + this.entityTextRetriever = entityTextRetriever; + buildTrie(ontology, nameGenerator); + } - public void buildTrie(OWLOntology ontology) { + public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { this.trie = new PrefixTrie<Set<Entity>>(); Map<Entity, Set<String>> relevantText = entityTextRetriever.getRelevantText(ontology); for (Entity entity : relevantText.keySet()) { + for (String text : relevantText.get(entity)) { - addEntry(text, entity); - // Adds also composing words, e.g. for "has child", "has" and "child" are also added - if (text.contains(" ")) { - for (String subtext : text.split(" ")) { - addEntry(subtext, entity); - //System.out.println("trie.add("+subtext+","++")"); - } - } - } - } + text = StringUtils.join(LinguisticUtil.getWordsFromCamelCase(text), " "); + text = StringUtils.join(LinguisticUtil.getWordsFromUnderscored(text), " "); + if (text.trim().isEmpty()) { + continue; + } + addEntry(text, entity); + for (String alternativeText : nameGenerator.getAlternativeText(text)) { +// System.out.println("New alternative text for " + text + " --> " + alternativeText); + addEntry(alternativeText, entity); + } + // Adds also composing words, e.g. for "has child", "has" and "child" are also added + if (text.contains(" ")) { + for (String subtext : text.split(" ")) { + addEntry(subtext, entity); + for (String alternativeText : nameGenerator.getAlternativeText(subtext)) { +// System.out.println("New alternative text for " + subtext + " --> " + alternativeText); + addEntry(alternativeText, entity); + } + //System.out.println("trie.add("+subtext+","++")"); + } + } + } + } } @Override @@ -62,7 +97,7 @@ public String toString() { String output = ""; Map<String,Set<Entity>> trieMap = trie.toMap(); - List<String> termsList = new ArrayList(trieMap.keySet()); + List<String> termsList = new ArrayList<String>(trieMap.keySet()); Collections.sort(termsList); for (String key : termsList) { output += key + ":\n"; @@ -78,4 +113,68 @@ } + public static interface NameGenerator { + /** + * Returns a list of possible alternative words for the given word + * + * @param text the text to return alternative words for + * @return alternative words for given word + */ + List<String> getAlternativeText(String text); + } + + public static class DummyNameGenerator implements NameGenerator { + @Override + public List<String> getAlternativeText(String word) { + return Collections.singletonList(word); + } + } + + /** + * Generates alternative texts by using WordNet synonyms. 
+ */ + public static class WordNetNameGenerator implements NameGenerator { + private int maxNumberOfSenses = 5; + + /** + * Sets up the generator for returning the lemmas of the top {@code maxNumberOfSenses} senses. + * @param maxNumberOfSenses the maximum number of senses to aggregate word lemmas from + */ + public WordNetNameGenerator(int maxNumberOfSenses) { + this.maxNumberOfSenses = maxNumberOfSenses; + } + + @Override + public List<String> getAlternativeText(String word) { + return Arrays.asList(LinguisticUtil.getTopSynonymsForWord(word, maxNumberOfSenses)); + } + } + + /** + * Generates alternative texts by using WordNet synonym and lemmatizing of the original words + */ + public static class LemmatizingWordNetNameGenerator implements NameGenerator { + private int maxNumberOfSenses = 5; + + /** + * Sets up the generator for returning the lemmas of the top {@code maxNumberOfSenses} senses. + * @param maxNumberOfSenses the maximum number of senses to aggregate word lemmas from + */ + public LemmatizingWordNetNameGenerator(int maxNumberOfSenses) { + this.maxNumberOfSenses = maxNumberOfSenses; + } + + @Override + public List<String> getAlternativeText(String word) { + ArrayList<String> res = new ArrayList<String>(); + res.add(LinguisticUtil.getNormalizedForm(word)); + + for (String w : LinguisticUtil + .getTopSynonymsForWord(LinguisticUtil.getNormalizedForm(word), maxNumberOfSenses)) { + res.add(w.replaceAll("_", " ")); + } + + return res; + } + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-06 10:01:53 UTC (rev 4092) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-06 11:36:33 UTC (rev 4093) @@ -5,36 +5,36 @@ /** * Annotates a document using a prefix trie + * * @author Andre Melo - * */ public class TrieLinguisticAnnotator implements LinguisticAnnotator { - - EntityCandidatesTrie candidatesTrie; - - public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) { - this.candidatesTrie = candidatesTrie; - } - - /** - * Generates annotation based on trie's longest matching strings - * @param document - * @return - */ - @Override - public Set<Annotation> annotate(Document document) { - String content = document.getContent(); - Set<Annotation> annotations = new HashSet<Annotation>(); - for (int i=0; i<content.length(); i++) { - String unparsed = content.substring(i); - String match = candidatesTrie.getLongestMatch(unparsed); - if (match!=null && !match.isEmpty()) { - Annotation annotation = new Annotation(document, i, match.length()); - annotations.add(annotation); - i += match.length()-1; - } - } - return annotations; - } + EntityCandidatesTrie candidatesTrie; + public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) { + this.candidatesTrie = candidatesTrie; + } + + /** + * Generates annotation based on trie's longest matching strings + * + * @param document the document to get annotations for + * @return the set of annotation for the given document + */ + @Override + public Set<Annotation> annotate(Document document) { + String content = document.getContent(); + Set<Annotation> annotations = new HashSet<Annotation>(); + for (int i = 0; i < content.length(); i++) { + String unparsed = content.substring(i); + String match = candidatesTrie.getLongestMatch(unparsed); 
+ if (match != null && !match.isEmpty()) { + Annotation annotation = new Annotation(document, i, match.length()); + annotations.add(annotation); + i += match.length() - 1; + } + } + return annotations; + } + } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-06 10:01:53 UTC (rev 4092) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-06 11:36:33 UTC (rev 4093) @@ -30,7 +30,8 @@ */ public SimpleSemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex) { super(ontology); - SimpleEntityCandidatesTrie trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology); + SimpleEntityCandidatesTrie trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), + ontology, new SimpleEntityCandidatesTrie.LemmatizingWordNetNameGenerator(5)); // trie.printTrie(); setSemanticAnnotator(new SemanticAnnotator( new SimpleWordSenseDisambiguation(ontology), This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
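The Math.min bound added to WordNet.java above is the core fix of this revision: IndexWord.getSenses() can return fewer than the requested n synsets, and the old loop then read past the end of the array. A minimal standalone sketch of the bounded lookup, assuming the JWNL types (IndexWord, Synset, Word) that WordNet.java already uses; lookup errors are simply propagated:

import java.util.ArrayList;
import java.util.List;

import net.didion.jwnl.data.IndexWord;
import net.didion.jwnl.data.Synset;
import net.didion.jwnl.data.Word;

public static List<String> getTopSynonyms(IndexWord iw, String s, int n) throws Exception {
    List<String> synonyms = new ArrayList<String>();
    if (iw != null) {
        Synset[] synsets = iw.getSenses();
        // iterate over at most n senses, but never past the end of the array
        for (int i = 0; i < Math.min(n, synsets.length); i++) {
            for (Word word : synsets[i].getWords()) {
                String lemma = word.getLemma();
                // skip the input word itself and multi-word lemmas
                if (!lemma.equals(s) && !lemma.contains(" ")) {
                    synonyms.add(lemma);
                }
            }
        }
    }
    return synonyms;
}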
From: <dfl...@us...> - 2013-09-06 13:31:47
Revision: 4095 http://sourceforge.net/p/dl-learner/code/4095 Author: dfleischhacker Date: 2013-09-06 13:31:43 +0000 (Fri, 06 Sep 2013) Log Message: ----------- Make LinguisticUtil singleton Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-06 12:48:08 UTC (rev 4094) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-06 13:31:43 UTC (rev 4095) @@ -6,23 +6,25 @@ import org.dllearner.algorithms.isle.WordNet; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; /** * Provides shortcuts to commonly used linguistic operations * @author Daniel Fleischhacker */ public class LinguisticUtil { + private static LinguisticUtil instance; + private static final WordNet wn = new WordNet(); private static POS[] RELEVANT_POS = new POS[]{POS.NOUN, POS.VERB}; private static Lemmatizer lemmatizer; - static { - try { - lemmatizer = new DefaultLemmatizer(); + public static LinguisticUtil getInstance() { + if (instance == null) { + instance = new LinguisticUtil(); } - catch (Exception e) { - e.printStackTrace(); - } + return instance; } /** @@ -30,7 +32,7 @@ * @param camelCase the word containing camelcase to split * @return all words as camelcase contained in the given word */ - public static String[] getWordsFromCamelCase(String camelCase) { + public String[] getWordsFromCamelCase(String camelCase) { ArrayList<String> resultingWords = new ArrayList<String>(); StringBuilder sb = new StringBuilder(); for (int i = 0; i < camelCase.length(); i++) { @@ -66,7 +68,7 @@ * @param underScored word to split at underscores * @return words contained in given word */ - public static String[] getWordsFromUnderscored(String underScored) { + public String[] getWordsFromUnderscored(String underScored) { return underScored.split("_"); } @@ -77,7 +79,7 @@ * @param word the word to retrieve synonyms for * @return synonyms for the given word */ - public static String[] getSynonymsForWord(String word) { + public String[] getSynonymsForWord(String word) { ArrayList<String> synonyms = new ArrayList<String>(); for (POS pos : RELEVANT_POS) { @@ -94,7 +96,7 @@ * @param n the number of senses to get lemmas for * @return synonyms for the given word */ - public static String[] getTopSynonymsForWord(String word, int n) { + public String[] getTopSynonymsForWord(String word, int n) { ArrayList<String> synonyms = new ArrayList<String>(); for (POS pos : RELEVANT_POS) { @@ -104,30 +106,48 @@ } /** - * Returns the normalized form of the given word. This method is only able to work with single words! If there is an - * error normalizing the given word, the word itself is returned. + * Returns the normalized form of the given word. 
If the word contains spaces, each part separated by spaces is + * normalized independently and joined afterwards. If there is an error normalizing the given word, the word itself + * is returned. * * @param word the word to get normalized form for * @return normalized form of the word or the word itself on an error */ - public static String getNormalizedForm(String word) { - try { - if (lemmatizer == null) { - return word; + public String getNormalizedForm(String word) { + StringBuilder res = new StringBuilder(); + + boolean first = true; + + ArrayList<String> singleWords = new ArrayList<String>(); + Collections.addAll(singleWords, word.split(" ")); + + for (String w : singleWords) { + try { + if (first) { + first = false; + } + else { + res.append(" "); + } + if (lemmatizer == null) { + res.append(w); + } + else { + res.append(lemmatizer.lemmatize(w)); + } } - return lemmatizer.lemmatize(word); + catch (Exception e) { + e.printStackTrace(); + } } - catch (Exception e) { - e.printStackTrace(); - } - return word; + return res.toString(); } public static void main(String[] args) { - System.out.println(getNormalizedForm("going")); - for (String s : getWordsFromCamelCase("thisIsAClassWith1Name123")) { + System.out.println(LinguisticUtil.getInstance().getNormalizedForm("going")); + for (String s : LinguisticUtil.getInstance().getWordsFromCamelCase("thisIsAClassWith1Name123")) { System.out.println(s); - for (String w : getSynonymsForWord(s)) { + for (String w : LinguisticUtil.getInstance().getSynonymsForWord(s)) { System.out.println(" --> " + w); } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-06 12:48:08 UTC (rev 4094) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-06 13:31:43 UTC (rev 4095) @@ -42,7 +42,10 @@ Set<SemanticAnnotation> semanticAnnotations = new HashSet<SemanticAnnotation>(); for (Annotation annotation : annotations) { Set<Entity> candidateEntities = entityCandidateGenerator.getCandidates(annotation); - SemanticAnnotation semanticAnnotation = wordSenseDisambiguation.disambiguate(annotation, candidateEntities); + if (candidateEntities == null || candidateEntities.size() == 0) { + continue; + } + SemanticAnnotation semanticAnnotation = wordSenseDisambiguation.disambiguate(annotation, candidateEntities); if(semanticAnnotation != null){ semanticAnnotations.add(semanticAnnotation); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-09-06 12:48:08 UTC (rev 4094) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-09-06 13:31:43 UTC (rev 4095) @@ -45,14 +45,14 @@ for (Entity entity : relevantText.keySet()) { for (String text : relevantText.get(entity)) { - text = StringUtils.join(LinguisticUtil.getWordsFromCamelCase(text), " "); - text = StringUtils.join(LinguisticUtil.getWordsFromUnderscored(text), " "); + text = StringUtils.join(LinguisticUtil.getInstance().getWordsFromCamelCase(text), " "); + text = StringUtils.join(LinguisticUtil.getInstance().getWordsFromUnderscored(text), " "); if 
(text.trim().isEmpty()) { continue; } addEntry(text, entity); for (String alternativeText : nameGenerator.getAlternativeText(text)) { -// System.out.println("New alternative text for " + text + " --> " + alternativeText); + System.out.println("New alternative text for " + text + " --> " + alternativeText); addEntry(alternativeText, entity); } // Adds also composing words, e.g. for "has child", "has" and "child" are also added @@ -60,7 +60,7 @@ for (String subtext : text.split(" ")) { addEntry(subtext, entity); for (String alternativeText : nameGenerator.getAlternativeText(subtext)) { -// System.out.println("New alternative text for " + subtext + " --> " + alternativeText); + System.out.println("New alternative text for " + subtext + " --> " + alternativeText); addEntry(alternativeText, entity); } //System.out.println("trie.add("+subtext+","++")"); @@ -146,7 +146,7 @@ @Override public List<String> getAlternativeText(String word) { - return Arrays.asList(LinguisticUtil.getTopSynonymsForWord(word, maxNumberOfSenses)); + return Arrays.asList(LinguisticUtil.getInstance().getTopSynonymsForWord(word, maxNumberOfSenses)); } } @@ -167,10 +167,10 @@ @Override public List<String> getAlternativeText(String word) { ArrayList<String> res = new ArrayList<String>(); - res.add(LinguisticUtil.getNormalizedForm(word)); + res.add(LinguisticUtil.getInstance().getNormalizedForm(word)); - for (String w : LinguisticUtil - .getTopSynonymsForWord(LinguisticUtil.getNormalizedForm(word), maxNumberOfSenses)) { + for (String w : LinguisticUtil.getInstance() + .getTopSynonymsForWord(LinguisticUtil.getInstance().getNormalizedForm(word), maxNumberOfSenses)) { res.add(w.replaceAll("_", " ")); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-06 12:48:08 UTC (rev 4094) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-06 13:31:43 UTC (rev 4095) @@ -27,7 +27,7 @@ Set<Annotation> annotations = new HashSet<Annotation>(); for (int i = 0; i < content.length(); i++) { String unparsed = content.substring(i); - String match = candidatesTrie.getLongestMatch(unparsed); + String match = candidatesTrie.getLongestMatch(LinguisticUtil.getInstance().getNormalizedForm(unparsed)); if (match != null && !match.isEmpty()) { Annotation annotation = new Annotation(document, i, match.length()); annotations.add(annotation); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-09-06 12:48:08 UTC (rev 4094) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-09-06 13:31:43 UTC (rev 4095) @@ -91,8 +91,8 @@ if(textWithWeight.isEmpty() && useShortFormFallback){ String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); - shortForm = Joiner.on(" ").join(LinguisticUtil.getWordsFromCamelCase(shortForm)); - shortForm = Joiner.on(" ").join(LinguisticUtil.getWordsFromUnderscored(shortForm)).trim(); + shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromCamelCase(shortForm)); + shortForm = Joiner.on(" 
").join(LinguisticUtil.getInstance().getWordsFromUnderscored(shortForm)).trim(); textWithWeight.put(shortForm, weight); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-09-09 10:43:49
Revision: 4102 http://sourceforge.net/p/dl-learner/code/4102 Author: lorenz_b Date: 2013-09-09 10:43:46 +0000 (Mon, 09 Sep 2013) Log Message: ----------- Added class to compute the cosine similarity for 2 documents using the Lucene API. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java 2013-09-09 10:18:57 UTC (rev 4101) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java 2013-09-09 10:43:46 UTC (rev 4102) @@ -57,7 +57,7 @@ @ConfigOption(name = "startNodeBonus", defaultValue="0.1") private double startNodeBonus = 0.1; - private double nlpBonusFactor = 0.0001; + private double nlpBonusFactor = 1; private Map<Entity, Double> entityRelevance; Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/VSMCosineDocumentSimilarity.java 2013-09-09 10:43:46 UTC (rev 4102) @@ -0,0 +1,238 @@ +/** + * + */ +package org.dllearner.algorithms.isle; + +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.math3.linear.ArrayRealVector; +import org.apache.commons.math3.linear.RealVector; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Version; + +/** + * Imagine an N-dimensional space where N is the number of unique words in a pair of texts. Each of the two texts + * can be treated like a vector in this N-dimensional space. The distance between the two vectors is an indication + * of the similarity of the two texts. The cosine of the angle between the two vectors is the most common distance measure. 
+ * @author Lorenz Buehmann + * + */ +public class VSMCosineDocumentSimilarity { + + enum TermWeighting { + TF, TF_IDF + } + + public static final String CONTENT = "Content"; + public static final FieldType TYPE_STORED = new FieldType(); + + private final Set<String> terms = new HashSet<String>(); + private final RealVector v1; + private final RealVector v2; + + static { + TYPE_STORED.setIndexed(true); + TYPE_STORED.setTokenized(true); + TYPE_STORED.setStored(true); + TYPE_STORED.setStoreTermVectors(true); + TYPE_STORED.setStoreTermVectorPositions(true); + TYPE_STORED.freeze(); + } + + public VSMCosineDocumentSimilarity(String s1, String s2, TermWeighting termWeighting) throws IOException { + //create the index + Directory directory = createIndex(s1, s2); + IndexReader reader = DirectoryReader.open(directory); + //generate the document vectors + if(termWeighting == TermWeighting.TF){//based on term frequency only + //compute the term frequencies for document 1 + Map<String, Integer> f1 = getTermFrequencies(reader, 0); + //compute the term frequencies for document 2 + Map<String, Integer> f2 = getTermFrequencies(reader, 1); + reader.close(); + //map both documents to vector objects + v1 = getTermVectorInteger(f1); + v2 = getTermVectorInteger(f2); + } else if(termWeighting == TermWeighting.TF_IDF){//based on tf*idf weighting + //compute the term frequencies for document 1 + Map<String, Double> f1 = getTermWeights(reader, 0); + //compute the term frequencies for document 2 + Map<String, Double> f2 = getTermWeights(reader, 1); + reader.close(); + //map both documents to vector objects + v1 = getTermVectorDouble(f1); + v2 = getTermVectorDouble(f2); + } else { + v1 = null; + v2 = null; + } + } + + public VSMCosineDocumentSimilarity(String s1, String s2) throws IOException { + this(s1, s2, TermWeighting.TF_IDF); + } + + /** + * Returns the cosine document similarity between document {@code doc1} and {@code doc2} using TF-IDF as weighting for each term. + * The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning exactly the same, + * with 0 usually indicating independence, and in-between values indicating intermediate similarity or dissimilarity. + * @param s1 + * @param s2 + * @return + * @throws IOException + */ + public static double getCosineSimilarity(String doc1, String doc2) + throws IOException { + return new VSMCosineDocumentSimilarity(doc1, doc2).getCosineSimilarity(); + } + + /** + * Returns the cosine document similarity between document {@code doc1} and {@code doc2} based on {@code termWeighting} to compute the weight + * for each term in the documents. + * The resulting similarity ranges from -1 meaning exactly opposite, to 1 meaning exactly the same, + * with 0 usually indicating independence, and in-between values indicating intermediate similarity or dissimilarity. + * @param s1 + * @param s2 + * @return + * @throws IOException + */ + public static double getCosineSimilarity(String doc1, String doc2, TermWeighting termWeighting) + throws IOException { + return new VSMCosineDocumentSimilarity(doc1, doc2, termWeighting).getCosineSimilarity(); + } + + /** + * Create a in-memory Lucene index for both documents. 
+ * @param s1 + * @param s2 + * @return + * @throws IOException + */ + private Directory createIndex(String s1, String s2) throws IOException { + Directory directory = new RAMDirectory(); + Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_43); + IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer); + IndexWriter writer = new IndexWriter(directory, iwc); + addDocument(writer, s1); + addDocument(writer, s2); + writer.close(); + return directory; + } + + /** + * Add the document to the Lucene index. + * @param writer + * @param content + * @throws IOException + */ + private void addDocument(IndexWriter writer, String content) throws IOException { + Document doc = new Document(); + Field field = new Field(CONTENT, content, TYPE_STORED); + doc.add(field); + writer.addDocument(doc); + } + + /** + * Get the frequency of each term contained in the document. + * @param reader + * @param docId + * @return + * @throws IOException + */ + private Map<String, Integer> getTermFrequencies(IndexReader reader, int docId) + throws IOException { + Terms vector = reader.getTermVector(docId, CONTENT); + TermsEnum termsEnum = vector.iterator(null); + Map<String, Integer> frequencies = new HashMap<String, Integer>(); + BytesRef text = null; + while ((text = termsEnum.next()) != null) { + String term = text.utf8ToString(); + int freq = (int) termsEnum.totalTermFreq(); + frequencies.put(term, freq); + terms.add(term); + } + return frequencies; + } + + /** + * Get the weight(tf*idf) of each term contained in the document. + * @param reader + * @param docId + * @return + * @throws IOException + */ + private Map<String, Double> getTermWeights(IndexReader reader, int docId) + throws IOException { + Terms vector = reader.getTermVector(docId, CONTENT); + TermsEnum termsEnum = vector.iterator(null); + Map<String, Double> weights = new HashMap<String, Double>(); + BytesRef text = null; + while ((text = termsEnum.next()) != null) { + String term = text.utf8ToString(); + //get the term frequency + int tf = (int) termsEnum.totalTermFreq(); + //get the document frequency + int df = reader.docFreq(new Term(CONTENT, text)); + //compute the inverse document frequency + double idf = getIDF(reader.numDocs(), df); + //compute tf*idf + double weight = tf * idf; + + weights.put(term, weight); + terms.add(term); + } + return weights; + } + + private double getIDF(int totalNumberOfDocuments, int documentFrequency){ + return 1 + Math.log(totalNumberOfDocuments/documentFrequency); + } + + private double getCosineSimilarity() { + return (v1.dotProduct(v2)) / (v1.getNorm() * v2.getNorm()); + } + + private RealVector getTermVectorInteger(Map<String, Integer> map) { + RealVector vector = new ArrayRealVector(terms.size()); + int i = 0; + for (String term : terms) { + int value = map.containsKey(term) ? map.get(term) : 0; + vector.setEntry(i++, value); + } + return vector.mapDivide(vector.getL1Norm()); + } + + private RealVector getTermVectorDouble(Map<String, Double> map) { + RealVector vector = new ArrayRealVector(terms.size()); + int i = 0; + for (String term : terms) { + double value = map.containsKey(term) ? 
map.get(term) : 0d; + vector.setEntry(i++, value); + } + return vector.mapDivide(vector.getL1Norm()); + } + + public static void main(String[] args) throws Exception { + double cosineSimilarity = VSMCosineDocumentSimilarity.getCosineSimilarity("The king is here", "The salad is cold"); + System.out.println(cosineSimilarity); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
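In formulas, the class computes the standard vector-space measure: with both documents mapped to L1-normalized term vectors,

\[
\mathrm{sim}(d_1, d_2) \;=\; \cos\theta \;=\; \frac{\vec{v}_1 \cdot \vec{v}_2}{\lVert \vec{v}_1 \rVert \, \lVert \vec{v}_2 \rVert},
\qquad
w_{t,d} \;=\; \mathrm{tf}_{t,d} \cdot \Bigl(1 + \log \frac{N}{\mathrm{df}_t}\Bigr)
\]

where the second equation is the per-term weight in TF_IDF mode, matching getIDF above; N is the number of indexed documents, which is always 2 here. One caveat: getIDF divides two ints, so N/df truncates before the logarithm. With N = 2 and df in {1, 2} the truncated and exact quotients coincide, but a version indexing more documents should cast to double first.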
From: <and...@us...> - 2013-09-10 15:52:52
Revision: 4107 http://sourceforge.net/p/dl-learner/code/4107 Author: andremelo Date: 2013-09-10 15:52:48 +0000 (Tue, 10 Sep 2013) Log Message: ----------- - Adding the method to EntitityCandidateGenerator interface: HashMap<Annotation,Set<Entity>> getCandidatesMap(Set<Annotation> annotations) - Adding first version of the postprocessing from the trie implementation Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java 2013-09-10 15:49:18 UTC (rev 4106) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java 2013-09-10 15:52:48 UTC (rev 4107) @@ -3,6 +3,7 @@ */ package org.dllearner.algorithms.isle; +import java.util.HashMap; import java.util.Set; import org.dllearner.algorithms.isle.index.Annotation; @@ -22,4 +23,7 @@ } public abstract Set<Entity> getCandidates(Annotation annotation); + + + public abstract HashMap<Annotation,Set<Entity>> getCandidatesMap(Set<Annotation> annotations); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-10 15:49:18 UTC (rev 4106) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-10 15:52:48 UTC (rev 4107) @@ -1,5 +1,6 @@ package org.dllearner.algorithms.isle.index; +import java.util.HashMap; import java.util.HashSet; import java.util.Set; @@ -40,8 +41,9 @@ public AnnotatedDocument processDocument(TextDocument document){ Set<Annotation> annotations = linguisticAnnotator.annotate(document); Set<SemanticAnnotation> semanticAnnotations = new HashSet<SemanticAnnotation>(); - for (Annotation annotation : annotations) { - Set<Entity> candidateEntities = entityCandidateGenerator.getCandidates(annotation); + HashMap<Annotation,Set<Entity>> candidatesMap = entityCandidateGenerator.getCandidatesMap(annotations); + for (Annotation annotation : candidatesMap.keySet()) { + Set<Entity> candidateEntities = candidatesMap.get(annotation); if (candidateEntities == null || candidateEntities.size() == 0) { continue; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java 2013-09-10 15:49:18 UTC (rev 4106) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java 2013-09-10 15:52:48 UTC (rev 4107) @@ -3,6 +3,7 @@ */ package org.dllearner.algorithms.isle.index; +import java.util.HashMap; import java.util.HashSet; import java.util.Set; @@ -39,4 +40,13 @@ return allEntities; } + @Override + public HashMap<Annotation, 
Set<Entity>> getCandidatesMap(Set<Annotation> annotations) { + HashMap<Annotation, Set<Entity>> result = new HashMap<Annotation, Set<Entity>>(); + for (Annotation annotation: annotations) + result.put(annotation, getCandidates(annotation)); + + return result; + } + } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-09-10 15:49:18 UTC (rev 4106) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-09-10 15:52:48 UTC (rev 4107) @@ -1,11 +1,24 @@ package org.dllearner.algorithms.isle.index; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Set; +import java.util.regex.Pattern; import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.StopWordFilter; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; +import cern.colt.Arrays; +import cern.colt.list.AbstractCollection; + +import edu.stanford.nlp.util.Sets; + /** * Generates candidates using a entity candidates prefix trie * @author Andre Melo @@ -13,7 +26,9 @@ */ public class TrieEntityCandidateGenerator extends EntityCandidateGenerator{ - EntityCandidatesTrie candidatesTrie; + final EntityCandidatesTrie candidatesTrie; + final StopWordFilter stopWordFilter = new StopWordFilter(); + int window = 10; public TrieEntityCandidateGenerator(OWLOntology ontology, EntityCandidatesTrie candidatesTrie) { super(ontology); @@ -24,4 +39,103 @@ return candidatesTrie.getCandidateEntities(annotation.getToken()); } + /** + * Postprocess the annotations generated by annotate + * The objective is to merge annotations which are likely to belong to the same entity + * @param annotations : set of annotations + * @param window : maximum distance between the annotations + * @return + */ + public void postProcess(HashMap<Annotation,Set<Entity>> candidatesMap, int window, StopWordFilter stopWordFilter) { + Set<Annotation> annotations = candidatesMap.keySet(); + List<Annotation> sortedAnnotations = new ArrayList<Annotation>(annotations); + + // Sort annotations by offset in ascending order + Collections.sort(sortedAnnotations, new Comparator<Annotation>(){ + public int compare(Annotation a1,Annotation a2){ + return Integer.compare(a1.getOffset(), a2.getOffset()); + } + }); + + int windowStart = 0; + int windowEnd = 0; + for (int i=0; i<sortedAnnotations.size(); i++) { + + Annotation annotation_i = sortedAnnotations.get(i); + int begin_i = annotation_i.getOffset(); + int end_i = begin_i + annotation_i.getLength()-1; + String token_i = annotation_i.getToken(); + Set<Entity> candidates_i = getCandidates(annotation_i); + Set<Entity> newCandidates_i = new HashSet<Entity>(); + + // Determine the annotations contained in the window + while ((sortedAnnotations.get(windowStart).getOffset()+sortedAnnotations.get(windowStart).getLength()-1)<(begin_i-window)) + windowStart++; + while (windowEnd<sortedAnnotations.size() && sortedAnnotations.get(windowEnd).getOffset()<(end_i+window)) + windowEnd++; + + // For every annotation in the window (defined by the number of characters between offsets) + for (int j=windowStart; j<sortedAnnotations.size() && j<windowEnd; j++) { + 
if (j!=i) { + Annotation annotation_j = sortedAnnotations.get(j); + String token_j = annotation_j.getToken(); + Set<Entity> candidates_j = getCandidates(annotation_j); + Set<Entity> intersection = Sets.intersection(candidates_i, candidates_j); + Set<Entity> newCandidates_ij = new HashSet<Entity>(); + for (Entity commonEntity: intersection) { + if (!(stopWordFilter.isStopWord(token_i) && stopWordFilter.isStopWord(token_j))) { + if (!token_i.contains(token_j) && !token_j.contains(token_i)) { + newCandidates_ij.add(commonEntity); + //System.out.println("common("+token_i+","+token_j+")="+commonEntity); + } + } + } + if (!newCandidates_ij.isEmpty()) { + Annotation mergedAnnotation = mergeAnnotations(annotation_i,annotation_j); + // If there's no punctuation in the merged annotation + if (!Pattern.matches("\\p{Punct}", mergedAnnotation.getToken())) { + candidatesMap.put(mergedAnnotation, newCandidates_ij); + candidatesMap.remove(annotation_i); + candidatesMap.remove(annotation_j); + } + + newCandidates_i.addAll(newCandidates_ij); + } + } + } + + // Deletes annotation if it's a stop word and doesn't have any matching annotation in the window + if (stopWordFilter.isStopWord(token_i)) { + if (newCandidates_i.isEmpty()) + candidatesMap.remove(annotation_i); + } + } + + + + } + + private Annotation mergeAnnotations(Annotation annotation_i, Annotation annotation_j) { + int offset; + int length; + if (annotation_i.getOffset() < annotation_j.getOffset()) { + offset = annotation_i.getOffset(); + length = annotation_j.getOffset() - offset + annotation_j.getLength(); + } else { + offset = annotation_j.getOffset(); + length = annotation_i.getOffset() - offset + annotation_i.getLength(); + } + return new Annotation(annotation_i.getReferencedDocument(), offset, length); + } + + @Override + public HashMap<Annotation, Set<Entity>> getCandidatesMap(Set<Annotation> annotations) { + HashMap<Annotation, Set<Entity>> candidatesMap = new HashMap<Annotation, Set<Entity>>(); + for (Annotation annotation: annotations) + candidatesMap.put(annotation, getCandidates(annotation)); + + postProcess(candidatesMap, window, stopWordFilter); + + return candidatesMap; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
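A worked example of mergeAnnotations may help; the offsets, lengths, and document variable are hypothetical, using the offset/length Annotation constructor this revision still relies on:

// "has" starts at offset 10 with length 3, "child" at offset 14 with length 5.
// Since 10 < 14, the merged span starts at 10 with length 14 - 10 + 5 = 9,
// i.e. it covers "has child" including the separating space.
Annotation hasAnn   = new Annotation(document, 10, 3);
Annotation childAnn = new Annotation(document, 14, 5);
Annotation merged   = mergeAnnotations(hasAnn, childAnn);
// merged.getOffset() == 10, merged.getLength() == 9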
From: <lor...@us...> - 2013-10-29 14:20:42
Revision: 4134 http://sourceforge.net/p/dl-learner/code/4134 Author: lorenz_b Date: 2013-10-29 14:20:38 +0000 (Tue, 29 Oct 2013) Log Message: ----------- Remove owl:Thing Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-10-29 14:11:07 UTC (rev 4133) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-10-29 14:20:38 UTC (rev 4134) @@ -8,7 +8,6 @@ import java.util.Map; import java.util.Set; -import org.dllearner.algorithms.isle.index.LinguisticAnnotator; import org.dllearner.algorithms.isle.index.LinguisticUtil; import org.dllearner.core.owl.Entity; import org.dllearner.kb.OWLAPIOntology; @@ -16,6 +15,7 @@ import org.semanticweb.owlapi.model.IRI; import org.semanticweb.owlapi.model.OWLAnnotation; import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLClass; import org.semanticweb.owlapi.model.OWLEntity; import org.semanticweb.owlapi.model.OWLLiteral; import org.semanticweb.owlapi.model.OWLOntology; @@ -23,6 +23,8 @@ import org.semanticweb.owlapi.util.IRIShortFormProvider; import org.semanticweb.owlapi.util.SimpleIRIShortFormProvider; +import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; + import com.google.common.base.Joiner; @@ -42,6 +44,8 @@ private IRIShortFormProvider sfp = new SimpleIRIShortFormProvider(); private OWLAnnotationProperty[] properties; + + private static final OWLClass OWL_THING = new OWLDataFactoryImpl().getOWLThing(); public AnnotationEntityTextRetriever(OWLOntology ontology, OWLAnnotationProperty... 
properties) { this.ontology = ontology; @@ -111,6 +115,7 @@ schemaEntities.addAll(ontology.getClassesInSignature()); schemaEntities.addAll(ontology.getObjectPropertiesInSignature()); schemaEntities.addAll(ontology.getDataPropertiesInSignature()); + schemaEntities.remove(OWL_THING); Map<String, Double> relevantText; for (OWLEntity owlEntity : schemaEntities) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-10-29 14:11:07 UTC (rev 4133) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-10-29 14:20:38 UTC (rev 4134) @@ -23,18 +23,32 @@ public class WindowBasedContextExtractor implements ContextExtractor{ private StanfordCoreNLP pipeline; + private int tokensLeft = 10; + private int tokensRight = 10; - /** - * - */ - public WindowBasedContextExtractor() { + public WindowBasedContextExtractor(int tokensLeft, int tokensRight) { + this.tokensLeft = tokensLeft; + this.tokensRight = tokensRight; + + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit"); + pipeline = new StanfordCoreNLP(props); + } + public WindowBasedContextExtractor(int tokensLeftRight) { + tokensLeft = tokensLeftRight; + tokensRight = tokensLeftRight; + Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); pipeline = new StanfordCoreNLP(props); - - } + + public WindowBasedContextExtractor() { + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit"); + pipeline = new StanfordCoreNLP(props); + } /* (non-Javadoc) * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java.lang.String, java.lang.String) This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
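The three WindowBasedContextExtractor constructors above each rebuild the same two-annotator pipeline. Chaining them through this(...) would remove the duplication without changing behavior; a sketch, assuming the default of 10 tokens on each side stays as in the fields above:

public WindowBasedContextExtractor() {
    this(10, 10);
}

public WindowBasedContextExtractor(int tokensLeftRight) {
    this(tokensLeftRight, tokensLeftRight);
}

public WindowBasedContextExtractor(int tokensLeft, int tokensRight) {
    this.tokensLeft = tokensLeft;
    this.tokensRight = tokensRight;
    // the pipeline is now set up in exactly one place
    Properties props = new Properties();
    props.put("annotators", "tokenize, ssplit");
    pipeline = new StanfordCoreNLP(props);
}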
From: <lor...@us...> - 2013-10-29 14:59:43
Revision: 4137 http://sourceforge.net/p/dl-learner/code/4137 Author: lorenz_b Date: 2013-10-29 14:59:40 +0000 (Tue, 29 Oct 2013) Log Message: ----------- Fixed bug ins WSD. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StructuralEntityContext.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StructuralEntityContext.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StructuralEntityContext.java 2013-10-29 14:51:29 UTC (rev 4136) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StructuralEntityContext.java 2013-10-29 14:59:40 UTC (rev 4137) @@ -104,17 +104,7 @@ * @return */ public static Set<OWLEntity> getContext(OWLOntology ontology, Entity entity){ - - OWLEntity owlEntity = OWLAPIConverter.getOWLAPIEntity(entity); - if(owlEntity.isOWLClass()){ - return getContext(ontology, owlEntity.asOWLClass()); - } else if(owlEntity.isOWLObjectProperty()){ - return getContext(ontology, owlEntity.asOWLObjectProperty()); - } else if(owlEntity.isOWLDataProperty()){ - return getContext(ontology, owlEntity.asOWLDataProperty()); - } - - throw new UnsupportedOperationException("Unsupported entity type: " + entity); + return getContext(ontology, OWLAPIConverter.getOWLAPIEntity(entity)); } public static Set<OWLEntity> getContext(OWLOntology ontology, OWLObjectProperty property){ Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-10-29 14:51:29 UTC (rev 4136) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-10-29 14:59:40 UTC (rev 4137) @@ -25,7 +25,7 @@ this.candidatesTrie = candidatesTrie; } - public Set<Entity> getCandidates(Annotation annotation) { + public Set<Entity> getCandidates(Annotation annotation) {System.out.println(annotation); return candidatesTrie.getCandidateEntities(annotation.getMatchedString()); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-10-29 14:51:29 UTC (rev 4136) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-10-29 14:59:40 UTC (rev 4137) @@ -104,7 +104,7 @@ } /** - * Returns for each entity in the ontology all relevant text, i.e. eitherthe annotations or the short form of the IRI as fallback. + * Returns for each entity in the ontology all relevant text, i.e. either the annotations or the short form of the IRI as fallback. * @return */ @Override This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
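One wrinkle in this commit: getCandidates in TrieEntityCandidateGenerator now carries a System.out.println(annotation) on the same line as its opening brace, which looks like debugging output that slipped into the commit. The method without it:

public Set<Entity> getCandidates(Annotation annotation) {
    return candidatesTrie.getCandidateEntities(annotation.getMatchedString());
}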
From: <lor...@us...> - 2013-11-21 11:54:26
Revision: 4155 http://sourceforge.net/p/dl-learner/code/4155 Author: lorenz_b Date: 2013-11-21 11:54:23 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Added POS tags to text documents in constructor. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/StanfordPartOfSpeechTagger.java 2013-11-21 11:54:23 UTC (rev 4155) @@ -0,0 +1,59 @@ +package org.dllearner.algorithms.isle; + +import java.util.List; +import java.util.Properties; + +import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; +import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.Annotation; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.util.CoreMap; + +public class StanfordPartOfSpeechTagger { + + private static StanfordPartOfSpeechTagger instance; + private StanfordCoreNLP pipeline; + + private StanfordPartOfSpeechTagger(){ + Properties props = new Properties(); + props.put("annotators", "tokenize, ssplit, pos"); + pipeline = new StanfordCoreNLP(props); + } + + public static synchronized StanfordPartOfSpeechTagger getInstance(){ + if(instance == null){ + instance = new StanfordPartOfSpeechTagger(); + } + return instance; + } + + public String tag(String text) { + String out = ""; + + // create an empty Annotation just with the given text + Annotation document = new Annotation(text); + + // run all Annotators on this text + pipeline.annotate(document); + + // these are all the sentences in this document + // a CoreMap is essentially a Map that uses class objects as keys and has values with custom types + List<CoreMap> sentences = document.get(SentencesAnnotation.class); + + for(CoreMap sentence: sentences) { + for (CoreLabel token: sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = token.get(TextAnnotation.class); + // this is the POS tag of the token + String pos = token.get(PartOfSpeechAnnotation.class); + + out += " " + word + "/" + pos; + } + } + + return out.trim(); + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 11:39:19 UTC (rev 4154) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 11:54:23 UTC (rev 4155) @@ -1,5 +1,7 @@ package org.dllearner.algorithms.isle.index; +import org.dllearner.algorithms.isle.StanfordPartOfSpeechTagger; + /** * A simple text document without further formatting or markup. * @@ -10,7 +12,6 @@ private String rawContent; private String posTaggedContent; - /** * Initializes a text document with the given raw content. 
Internally, the content is cleaned up so that it only * contains letters adhering to the regular expression pattern [A-Za-z]. @@ -19,26 +20,24 @@ */ public TextDocument(String content) { this.rawContent = content; - this.content = content.toLowerCase(); - this.content = this.content.replaceAll("[^a-z ]", " "); - this.content = this.content.replaceAll("\\s{2,}", " "); - this.content = this.content.trim(); + + //build cleaned content + buildCleanedContent(); + + //build POS tagged content + buildPOSTaggedContent(); } - /** - * Initializes a text document with the given raw content. Internally, the content is cleaned up so that it only - * contains letters adhering to the regular expression pattern [A-Za-z]. - * - * @param content the raw content of this text document - */ - public TextDocument(String content, String posTaggedContent) { - this.rawContent = content; - this.posTaggedContent = posTaggedContent; - this.content = content.toLowerCase(); + private void buildCleanedContent(){ + this.content = content.toLowerCase(); this.content = this.content.replaceAll("[^a-z ]", " "); this.content = this.content.replaceAll("\\s{2,}", " "); this.content = this.content.trim(); } + + private void buildPOSTaggedContent(){ + this.posTaggedContent = StanfordPartOfSpeechTagger.getInstance().tag(rawContent); + } @Override public String getContent() { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-21 12:57:14
Revision: 4161 http://sourceforge.net/p/dl-learner/code/4161 Author: lorenz_b Date: 2013-11-21 12:57:10 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Cont. text document generator. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 12:51:05 UTC (rev 4160) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 12:57:10 UTC (rev 4161) @@ -19,7 +19,10 @@ public class TextDocumentGenerator { private static TextDocumentGenerator instance; + private StanfordCoreNLP pipeline; + private final String punctuationPattern = "\\p{Punct}"; + private final StopWordFilter stopWordFilter = new StopWordFilter(); private TextDocumentGenerator(){ Properties props = new Properties(); @@ -54,14 +57,22 @@ String pos = label.get(PartOfSpeechAnnotation.class); //this is the POS tag of the token String lemma = label.get(LemmaAnnotation.class); + //check if token is punctuation + boolean isPunctuation = word.matches(punctuationPattern); + //check if it is a stop word + boolean isStopWord = stopWordFilter.isStopWord(word); - Token token = new Token(word); - token.setPOSTag(pos); - token.setStemmedForm(lemma); + Token token = new Token(word, lemma, pos, isPunctuation, isStopWord); + document.add(token); } } return document; } + + public static void main(String[] args) throws Exception { + TextDocument document = TextDocumentGenerator.getInstance().tag("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. 
"); + System.out.println(document); + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-11-21 12:51:05 UTC (rev 4160) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-11-21 12:57:10 UTC (rev 4161) @@ -12,11 +12,21 @@ private String rawForm; private String stemmedForm; private String posTag; + private boolean isPunctuation; + private boolean isStopWord; public Token(String rawForm) { - posTag = rawForm; + this.rawForm = rawForm; } + public Token(String rawForm, String stemmedForm, String posTag, boolean isPunctuation, boolean isStopWord) { + this.rawForm = rawForm; + this.stemmedForm = stemmedForm; + this.posTag = posTag; + this.isPunctuation = isPunctuation; + this.isStopWord = isStopWord; + } + /** * @return the rawForm */ @@ -39,6 +49,20 @@ } /** + * @return the isPunctuation + */ + public boolean isPunctuation() { + return isPunctuation; + } + + /** + * @return the isStopWord + */ + public boolean isStopWord() { + return isStopWord; + } + + /** * @param stemmedForm the stemmedForm to set */ public void setStemmedForm(String stemmedForm) { @@ -51,14 +75,28 @@ public void setPOSTag(String posTag) { this.posTag = posTag; } + + /** + * @param isPunctuation the isPunctuation to set + */ + public void setIsPunctuation(boolean isPunctuation) { + this.isPunctuation = isPunctuation; + } + + /** + * @param isStopWord the isStopWord to set + */ + public void setIsStopWord(boolean isStopWord) { + this.isStopWord = isStopWord; + } /* (non-Javadoc) * @see java.lang.Object#toString() */ @Override public String toString() { - return "Word: " + rawForm + "\n" + return "\n[Word: " + rawForm + "\n" + "Stemmed word: " + stemmedForm + "\n" - + "POS tag: " + posTag; + + "POS tag: " + posTag + "]"; } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 13:16:16
Revision: 4163 http://sourceforge.net/p/dl-learner/code/4163 Author: dfleischhacker Date: 2013-11-21 13:16:13 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Annotation refactoring Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-11-21 13:00:33 UTC (rev 4162) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-11-21 13:16:13 UTC (rev 4163) @@ -4,6 +4,9 @@ package org.dllearner.algorithms.isle.index; +import java.util.ArrayList; +import java.util.List; + /** * A (non-semantic) annotation which represents an entity in a document by its offset and length. * @author Lorenz Buehmann @@ -12,8 +15,7 @@ public class Annotation { private Document referencedDocument; - private int offset; - private int length; + private ArrayList<Token> tokens; private String matchedString; public String getMatchedString() { @@ -24,64 +26,64 @@ this.matchedString = matchedString; } - public Annotation(Document referencedDocument, int offset, int length) { + public Annotation(Document referencedDocument, List<Token> tokens) { this.referencedDocument = referencedDocument; - this.offset = offset; - this.length = length; - } + this.tokens = new ArrayList<Token>(tokens); + } public Document getReferencedDocument() { return referencedDocument; } - public int getOffset() { - return offset; - } + public String getString(){ + StringBuilder sb = new StringBuilder(); + for (Token t : tokens) { + if (sb.length() > 0) { + sb.append(" "); + } + sb.append(t.getStemmedForm()); + } + return sb.toString(); + } - public int getLength() { - return length; - } - - public String getToken(){ - return referencedDocument.getContent().substring(offset, offset + length); - } + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + ((referencedDocument == null) ? 0 : referencedDocument.hashCode()); - result = prime * result + length; - result = prime * result + offset; - return result; - } + Annotation that = (Annotation) o; + if (matchedString != null ? !matchedString.equals(that.matchedString) : that.matchedString != null) { + return false; + } + if (referencedDocument != null ? !referencedDocument.equals(that.referencedDocument) : + that.referencedDocument != null) { + return false; + } + if (tokens != null ? !tokens.equals(that.tokens) : that.tokens != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = referencedDocument != null ? referencedDocument.hashCode() : 0; + result = 31 * result + (tokens != null ? tokens.hashCode() : 0); + result = 31 * result + (matchedString != null ? 
matchedString.hashCode() : 0); + return result; + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - Annotation other = (Annotation) obj; - if (referencedDocument == null) { - if (other.referencedDocument != null) - return false; - } else if (!referencedDocument.equals(other.referencedDocument)) - return false; - if (length != other.length) - return false; - if (offset != other.offset) - return false; - return true; - } - - /* (non-Javadoc) - * @see java.lang.Object#toString() - */ - @Override public String toString() { - return "\"" + referencedDocument.getContent().substring(offset, offset+length) + "\" at position " + offset; - } + return getString(); + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-11-21 13:00:33 UTC (rev 4162) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-11-21 13:16:13 UTC (rev 4163) @@ -54,7 +54,7 @@ Annotation annotation_i = sortedAnnotations.get(i); int begin_i = annotation_i.getOffset(); int end_i = begin_i + annotation_i.getLength()-1; - String token_i = annotation_i.getToken(); + String token_i = annotation_i.getString(); Set<Entity> candidates_i = getCandidates(annotation_i); Set<Entity> newCandidates_i = new HashSet<Entity>(); @@ -68,7 +68,7 @@ for (int j=windowStart; j<sortedAnnotations.size() && j<windowEnd; j++) { if (j!=i) { Annotation annotation_j = sortedAnnotations.get(j); - String token_j = annotation_j.getToken(); + String token_j = annotation_j.getString(); Set<Entity> candidates_j = getCandidates(annotation_j); Set<Entity> intersection = Sets.intersection(candidates_i, candidates_j); Set<Entity> newCandidates_ij = new HashSet<Entity>(); @@ -83,7 +83,7 @@ if (!newCandidates_ij.isEmpty()) { Annotation mergedAnnotation = mergeAnnotations(annotation_i,annotation_j); // If there's no punctuation in the merged annotation - if (!Pattern.matches("\\p{Punct}", mergedAnnotation.getToken())) { + if (!Pattern.matches("\\p{Punct}", mergedAnnotation.getString())) { candidatesMap.put(mergedAnnotation, newCandidates_ij); candidatesMap.remove(annotation_i); candidatesMap.remove(annotation_j); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-11-21 13:00:33 UTC (rev 4162) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-11-21 13:16:13 UTC (rev 4163) @@ -48,6 +48,7 @@ public SemanticIndex(OWLOntology ontology) { this.ontology = ontology; + } /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-11-21 13:00:33 UTC (rev 4162) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 
2013-11-21 13:16:13 UTC (rev 4163) @@ -57,7 +57,7 @@ } index += s.length(); } - throw new RuntimeException("Token " + annotation.getToken() + " not found in text " + annotation.getReferencedDocument().getRawContent()); + throw new RuntimeException("Token " + annotation.getString() + " not found in text " + annotation.getReferencedDocument().getRawContent()); } private List<CoreMap> getSentences(String document) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java 2013-11-21 13:00:33 UTC (rev 4162) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java 2013-11-21 13:16:13 UTC (rev 4163) @@ -50,7 +50,7 @@ public SemanticAnnotation disambiguate(Annotation annotation, Set<Entity> candidateEntities) { logger.debug("Linguistic annotations:\n" + annotation); logger.debug("Candidate entities:" + candidateEntities); - String token = annotation.getToken().trim(); + String token = annotation.getString().trim(); //check if annotation token matches label of entity or the part behind #(resp. /) for (Entity entity : candidateEntities) { Set<String> labels = getLabels(entity); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-21 13:38:25
|
Revision: 4164 http://sourceforge.net/p/dl-learner/code/4164 Author: lorenz_b Date: 2013-11-21 13:38:03 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Refactored context extractors. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -37,7 +37,7 @@ return instance; } - public TextDocument tag(String text) { + public TextDocument generateDocument(String text) { TextDocument document = new TextDocument(); // create an empty Annotation just with the given text Annotation annotatedDocument = new Annotation(text); @@ -72,7 +72,7 @@ } public static void main(String[] args) throws Exception { - TextDocument document = TextDocumentGenerator.getInstance().tag("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. "); + TextDocument document = TextDocumentGenerator.getInstance().generateDocument("And he said, Amos, what seest thou? And I said, A basket of summer fruit. Then said the LORD unto me, The end is come upon my people of Israel; I will not again pass by them any more. 
"); System.out.println(document); } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -34,6 +34,13 @@ public Document getReferencedDocument() { return referencedDocument; } + + /** + * @return the tokens + */ + public ArrayList<Token> getTokens() { + return tokens; + } public String getString(){ StringBuilder sb = new StringBuilder(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -16,6 +16,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; +import org.dllearner.algorithms.isle.TextDocumentGenerator; import org.dllearner.algorithms.isle.index.TextDocument; import java.io.File; @@ -61,7 +62,7 @@ ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs; for (int i = 0; i < result.length; i++) { Document doc = searcher.doc(result[i].doc); - documents.add(new TextDocument(doc.get(searchField))); + documents.add(TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField))); } } catch (ParseException e) { e.printStackTrace(); @@ -85,7 +86,7 @@ try { Document doc = indexReader.document(i); String content = doc.get(searchField); - documents.add(new TextDocument(content)); + documents.add(TextDocumentGenerator.getInstance().generateDocument(content)); } catch (IOException e) { e.printStackTrace(); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SentenceBasedContextExtractor.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -3,6 +3,14 @@ */ package org.dllearner.algorithms.isle.wsd; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.Token; + import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; @@ -10,12 +18,7 @@ import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; -import org.dllearner.algorithms.isle.index.TextDocument; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - /** * @author Lorenz Buehmann * @@ -36,26 +39,29 @@ @Override public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) { //split text into sentences - 
List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getContent()); + List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent()); //find the sentence containing the token of the annotation - int tokenStart = annotation.getOffset(); - int index = 0; + Token firstToken = annotation.getTokens().get(0); for (CoreMap sentence : sentences) { - String s = sentence.toString(); - if (index <= tokenStart && s.length() > tokenStart) { + boolean found = false; + for (CoreLabel label : sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = label.get(TextAnnotation.class); + if(word.equals(firstToken.getRawForm())){ + found = true; + break; + } + } + if(found){ List<String> context = new ArrayList<String>(); for (CoreLabel label : sentence.get(TokensAnnotation.class)) { // this is the text of the token String word = label.get(TextAnnotation.class); - - if(!word.isEmpty() && !word.matches("\\p{Punct}")){ - context.add(word); - } + context.add(word); } return context; } - index += s.length(); } throw new RuntimeException("Token " + annotation.getString() + " not found in text " + annotation.getReferencedDocument().getRawContent()); } @@ -79,9 +85,8 @@ String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. IBM manufactures and markets computer hardware and software," + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology."; - String token = "services"; SentenceBasedContextExtractor extractor = new SentenceBasedContextExtractor(); - List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(new TextDocument(s), s.indexOf(token), token.length())); + List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token("American")))); System.out.println(context); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-11-21 13:16:13 UTC (rev 4163) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WindowBasedContextExtractor.java 2013-11-21 13:38:03 UTC (rev 4164) @@ -3,6 +3,14 @@ */ package org.dllearner.algorithms.isle.wsd; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; + +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.Token; + import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; @@ -10,18 +18,13 @@ import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; import edu.stanford.nlp.util.CoreMap; -import org.dllearner.algorithms.isle.index.TextDocument; -import java.util.ArrayList; -import java.util.List; -import java.util.Properties; - /** * @author Lorenz Buehmann - * + * */ -public class WindowBasedContextExtractor implements ContextExtractor{ - +public class WindowBasedContextExtractor implements ContextExtractor { + 
private StanfordCoreNLP pipeline; private int tokensLeft = 10; private int tokensRight = 10; @@ -29,57 +32,66 @@ public WindowBasedContextExtractor(int tokensLeft, int tokensRight) { this.tokensLeft = tokensLeft; this.tokensRight = tokensRight; - + Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); pipeline = new StanfordCoreNLP(props); } - + public WindowBasedContextExtractor(int tokensLeftRight) { tokensLeft = tokensLeftRight; tokensRight = tokensLeftRight; - + Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); pipeline = new StanfordCoreNLP(props); } - + public WindowBasedContextExtractor() { Properties props = new Properties(); props.put("annotators", "tokenize, ssplit"); pipeline = new StanfordCoreNLP(props); } - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java.lang.String, java.lang.String) + /* + * (non-Javadoc) + * + * @see + * org.dllearner.algorithms.isle.wsd.ContextExtractor#extractContext(java + * .lang.String, java.lang.String) */ @Override public List<String> extractContext(org.dllearner.algorithms.isle.index.Annotation annotation) { // split text into sentences - List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getContent()); + List<CoreMap> sentences = getSentences(annotation.getReferencedDocument().getRawContent()); // find the sentence containing the token of the annotation - int tokenStart = annotation.getOffset(); - int index = 0; + Token firstToken = annotation.getTokens().get(0); for (CoreMap sentence : sentences) { - String s = sentence.toString(); - if (index <= tokenStart && s.length() > tokenStart) { + boolean found = false; + for (CoreLabel label : sentence.get(TokensAnnotation.class)) { + // this is the text of the token + String word = label.get(TextAnnotation.class); + if (word.equals(firstToken.getRawForm())) { + found = true; + break; + } + } + if (found) { List<String> context = new ArrayList<String>(); for (CoreLabel label : sentence.get(TokensAnnotation.class)) { // this is the text of the token String word = label.get(TextAnnotation.class); - context.add(word); } return context; } - index += s.length(); } - throw new RuntimeException("Token " + annotation + " not found in text " - + annotation.getReferencedDocument().getContent()); + throw new RuntimeException("Token " + annotation.getString() + " not found in text " + + annotation.getReferencedDocument().getRawContent()); } - + private List<CoreMap> getSentences(String document) { // create an empty Annotation just with the given text Annotation annotation = new Annotation(document); @@ -94,14 +106,14 @@ return sentences; } - + public static void main(String[] args) throws Exception { String s = "International Business Machines Corporation, or IBM, is an American multinational services technology and consulting corporation, with headquarters in Armonk, New York, United States. 
IBM manufactures and markets computer hardware and software," + " and offers infrastructure, hosting and consulting services in areas ranging from mainframe computers to nanotechnology."; - + String token = "services"; WindowBasedContextExtractor extractor = new WindowBasedContextExtractor(); - List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(new TextDocument(s), s.indexOf(token), token.length())); + List<String> context = extractor.extractContext(new org.dllearner.algorithms.isle.index.Annotation(TextDocumentGenerator.getInstance().generateDocument(s), Arrays.asList(new Token("American")))); System.out.println(context); }
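The renamed entry point and the token-based annotations combine as in the following sketch, which closely follows the main methods in the diff above (the single-argument Token constructor is assumed to set the raw form that the extractors match on):

    import java.util.Arrays;
    import java.util.List;

    import org.dllearner.algorithms.isle.TextDocumentGenerator;
    import org.dllearner.algorithms.isle.index.TextDocument;
    import org.dllearner.algorithms.isle.index.Token;
    import org.dllearner.algorithms.isle.wsd.SentenceBasedContextExtractor;

    public class ContextExtractionSketch {
        public static void main(String[] args) {
            String s = "IBM is an American multinational corporation. It manufactures hardware and software.";
            // runs the Stanford pipeline and yields a token-level document
            TextDocument doc = TextDocumentGenerator.getInstance().generateDocument(s);
            // the extractor now locates the enclosing sentence by comparing raw token
            // forms, not character offsets
            List<String> context = new SentenceBasedContextExtractor().extractContext(
                    new org.dllearner.algorithms.isle.index.Annotation(doc, Arrays.asList(new Token("American"))));
            System.out.println(context); // the tokens of the first sentence
        }
    }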
From: <lor...@us...> - 2013-12-02 14:52:35
|
Revision: 4183 http://sourceforge.net/p/dl-learner/code/4183 Author: lorenz_b Date: 2013-12-02 14:52:33 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Refactoring. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 14:41:21 UTC (rev 4182) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 14:52:33 UTC (rev 4183) @@ -42,26 +42,21 @@ public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { this.trie = new PrefixTrie<FullTokenEntitySetPair>(); - Map<Entity, Set<String>> relevantText = entityTextRetriever.getRelevantText(ontology); + Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); - for (Entity entity : relevantText.keySet()) { - - for (String text : relevantText.get(entity)) { - text = StringUtils.join(LinguisticUtil.getInstance().getWordsFromCamelCase(text), " "); - text = StringUtils.join(LinguisticUtil.getInstance().getWordsFromUnderscored(text), " "); - if (text.trim().isEmpty()) { - continue; - } - - addEntry(text, entity); - addSubsequencesWordNet(entity, text); - - for (String alternativeText : nameGenerator.getAlternativeText(text)) { - addEntry(alternativeText.toLowerCase(), entity, text); - } - } - } + for (Entry<Entity, Set<List<Token>>> entry : entity2TokenSet.entrySet()) { + Entity entity = entry.getKey(); + Set<List<Token>> tokenSet = entry.getValue(); + for (List<Token> tokens : tokenSet) { + addEntry(tokens, entity); + addSubsequences(entity, tokens); +// addSubsequencesWordNet(entity, text); +// for (String alternativeText : nameGenerator.getAlternativeText(text)) { +// addEntry(alternativeText.toLowerCase(), entity, text); +// } + } + } } /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:41:21 UTC (rev 4182) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:52:33 UTC (rev 4183) @@ -127,6 +127,15 @@ tree.add(tokens1, new NamedClass("TokenTree")); tree.add(tokens2, new NamedClass("TokenizedTree")); System.out.println(tree); + + System.out.println(tree.getEntitiesForLongestMatch(tokens1)); + System.out.println(tree.getLongestMatch(tokens1)); + + List<Token> tokens3 = Lists.newLinkedList(); + for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) { + tokens3.add(new Token(s, s, s, false, false)); + }; + System.out.println(tree.getLongestMatch(tokens3)); } Modified: 
trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-02 14:41:21 UTC (rev 4182) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-02 14:52:33 UTC (rev 4183) @@ -5,10 +5,13 @@ import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Set; +import org.dllearner.algorithms.isle.TextDocumentGenerator; import org.dllearner.algorithms.isle.index.LinguisticUtil; +import org.dllearner.algorithms.isle.index.Token; import org.dllearner.core.owl.Entity; import org.dllearner.kb.OWLAPIOntology; import org.dllearner.utilities.owl.OWLAPIConverter; @@ -75,8 +78,8 @@ * @see org.dllearner.algorithms.isle.EntityTextRetriever#getRelevantText(org.dllearner.core.owl.Entity) */ @Override - public Map<String, Double> getRelevantText(Entity entity) { - Map<String, Double> textWithWeight = new HashMap<String, Double>(); + public Map<List<Token>, Double> getRelevantText(Entity entity) { + Map<List<Token>, Double> textWithWeight = new HashMap<List<Token>, Double>(); OWLEntity e = OWLAPIConverter.getOWLAPIEntity(entity); @@ -87,7 +90,7 @@ OWLLiteral val = (OWLLiteral) annotation.getValue(); if (val.hasLang(language)) { String label = val.getLiteral().trim(); - textWithWeight.put(label, weight); + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label), weight); } } } @@ -97,7 +100,7 @@ String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromCamelCase(shortForm)); shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromUnderscored(shortForm)).trim(); - textWithWeight.put(shortForm, weight); + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(shortForm), weight); } return textWithWeight; @@ -108,8 +111,8 @@ * @return */ @Override - public Map<Entity, Set<String>> getRelevantText(OWLOntology ontology) { - Map<Entity, Set<String>> entity2RelevantText = new HashMap<Entity, Set<String>>(); + public Map<Entity, Set<List<Token>>> getRelevantText(OWLOntology ontology) { + Map<Entity, Set<List<Token>>> entity2RelevantText = new HashMap<>(); Set<OWLEntity> schemaEntities = new HashSet<OWLEntity>(); schemaEntities.addAll(ontology.getClassesInSignature()); @@ -117,7 +120,7 @@ schemaEntities.addAll(ontology.getDataPropertiesInSignature()); schemaEntities.remove(OWL_THING); - Map<String, Double> relevantText; + Map<List<Token>, Double> relevantText; for (OWLEntity owlEntity : schemaEntities) { Entity entity = OWLAPIConverter.getEntity(owlEntity); relevantText = getRelevantText(entity); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java 2013-12-02 14:41:21 UTC (rev 4182) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/EntityTextRetriever.java 2013-12-02 14:52:33 UTC (rev 4183) @@ -19,9 +19,11 @@ package org.dllearner.algorithms.isle.textretrieval; +import java.util.List; import java.util.Map; import java.util.Set; +import 
org.dllearner.algorithms.isle.index.Token; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; @@ -45,8 +47,8 @@ * @param entity The entity to handle. * @return A weighted set of strings. For a value x, we need to have 0 <= x <= 1. */ - public Map<String, Double> getRelevantText(Entity entity); + public Map<List<Token>, Double> getRelevantText(Entity entity); - public Map<Entity, Set<String>> getRelevantText(OWLOntology ontology); + public Map<Entity, Set<List<Token>>> getRelevantText(OWLOntology ontology); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java 2013-12-02 14:41:21 UTC (rev 4182) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java 2013-12-02 14:52:33 UTC (rev 4183) @@ -4,12 +4,14 @@ package org.dllearner.algorithms.isle.textretrieval; import java.io.File; +import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; +import org.dllearner.algorithms.isle.index.Token; import org.dllearner.core.owl.Entity; import org.dllearner.kb.OWLAPIOntology; import org.semanticweb.owlapi.apibinding.OWLManager; @@ -43,13 +45,13 @@ OWLOntology ontology = man.loadOntology(IRI.create("http://www.semanticbible.com/2006/11/NTNames.owl")); RDFSLabelEntityTextRetriever labelRetriever = new RDFSLabelEntityTextRetriever(ontology); - Map<Entity, Set<String>> relevantText = labelRetriever.getRelevantText(ontology); + Map<Entity, Set<List<Token>>> relevantText = labelRetriever.getRelevantText(ontology); SortedMap<String, String> uri2Labels = new TreeMap<String, String>(); - for (Entry<Entity, Set<String>> entry : relevantText.entrySet()) { + for (Entry<Entity, Set<List<Token>>> entry : relevantText.entrySet()) { Entity key = entry.getKey(); - Set<String> value = entry.getValue(); - uri2Labels.put(key.getName(), value.iterator().next()); + Set<List<Token>> value = entry.getValue(); + uri2Labels.put(key.getName(), value.iterator().next().get(0).getRawForm()); } StringBuilder csv = new StringBuilder();
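After this change, callers of the text retrievers receive pre-tokenized label texts rather than raw strings. A consumer sketch along the lines of the main method above, reusing the ontology URL already used in the diff:

    import java.util.List;
    import java.util.Map;
    import java.util.Map.Entry;
    import java.util.Set;

    import org.dllearner.algorithms.isle.index.Token;
    import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever;
    import org.dllearner.core.owl.Entity;
    import org.semanticweb.owlapi.apibinding.OWLManager;
    import org.semanticweb.owlapi.model.IRI;
    import org.semanticweb.owlapi.model.OWLOntology;

    public class TokenizedLabelSketch {
        public static void main(String[] args) throws Exception {
            OWLOntology ontology = OWLManager.createOWLOntologyManager()
                    .loadOntology(IRI.create("http://www.semanticbible.com/2006/11/NTNames.owl"));
            RDFSLabelEntityTextRetriever retriever = new RDFSLabelEntityTextRetriever(ontology);
            // each entity now maps to a set of token sequences, one per label text
            Map<Entity, Set<List<Token>>> relevantText = retriever.getRelevantText(ontology);
            for (Entry<Entity, Set<List<Token>>> entry : relevantText.entrySet()) {
                for (List<Token> label : entry.getValue()) {
                    System.out.println(entry.getKey().getName() + " -> " + label);
                }
            }
        }
    }

This is also why SimpleEntityCandidatesTrie above can add token lists to the trie directly and derive subsequences from them without re-tokenizing.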
From: <lor...@us...> - 2013-12-09 14:22:22
|
Revision: 4193 http://sourceforge.net/p/dl-learner/code/4193 Author: lorenz_b Date: 2013-12-09 14:22:20 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Added generator class for semantic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -14,6 +14,9 @@ import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.Annotation; import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.trees.CollinsHeadFinder; +import edu.stanford.nlp.trees.Tree; +import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; import edu.stanford.nlp.util.CoreMap; public class TextDocumentGenerator { @@ -26,7 +29,7 @@ private TextDocumentGenerator(){ Properties props = new Properties(); - props.put("annotators", "tokenize, ssplit, pos, lemma"); + props.put("annotators", "tokenize, ssplit, pos, lemma, parse"); pipeline = new StanfordCoreNLP(props); } @@ -58,12 +61,21 @@ //this is the POS tag of the token String lemma = label.get(LemmaAnnotation.class); //check if token is punctuation - boolean isPunctuation = word.matches(punctuationPattern); + boolean isPunctuation = word.matches(punctuationPattern) + || pos.equalsIgnoreCase("-lrb-") + || pos.equalsIgnoreCase("-rrb-") + || word.startsWith("'") + ; //check if it is a stop word - boolean isStopWord = stopWordFilter.isStopWord(word); + boolean isStopWord = stopWordFilter.isStopWord(word.toLowerCase()); Token token = new Token(word, lemma, pos, isPunctuation, isStopWord); - + + //determine the head noun + Tree tree = sentence.get(TreeAnnotation.class); + CollinsHeadFinder headFinder = new CollinsHeadFinder(); + Tree head = headFinder.determineHead(tree); + document.add(token); } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -4,6 +4,8 @@ import java.util.LinkedList; import java.util.List; +import org.dllearner.algorithms.isle.TextDocumentGenerator; + /** * A simple text document without further formatting or markup. 
* @@ -11,13 +13,10 @@ */ public class TextDocument extends LinkedList<Token> implements Document { public static void main(String[] args) { - TextDocument t = new TextDocument(); String s = "This is a very long, nice text for testing our new implementation of TextDocument."; - for (String e : s.split(" ")) { - t.add(new Token(e)); - } + TextDocument doc = TextDocumentGenerator.getInstance().generateDocument(s); - System.out.println(t.getRawContent()); + System.out.println(doc.getRawContent()); } @Override Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -12,6 +12,8 @@ public class TrieLinguisticAnnotator implements LinguisticAnnotator { EntityCandidatesTrie candidatesTrie; private boolean normalizeWords = true; + + private boolean ignoreStopWords = true; public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) { this.candidatesTrie = candidatesTrie; @@ -30,11 +32,13 @@ List<Token> matchedTokens; for (Token token : document) { - matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true)); - if(matchedTokens != null && !matchedTokens.isEmpty()){ - Annotation annotation = new Annotation(document, matchedTokens); - annotations.add(annotation); - } + if(!(token.isPunctuation() ||token.isStopWord())){ + matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true)); + if(matchedTokens != null && !matchedTokens.isEmpty()){ + Annotation annotation = new Annotation(document, matchedTokens); + annotations.add(annotation); + } + } } return annotations; } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndexGenerator.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -0,0 +1,163 @@ +package org.dllearner.algorithms.isle.index.semantic; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.HashSet; +import java.util.Set; + +import org.apache.log4j.Logger; +import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.LinguisticAnnotator; +import org.dllearner.algorithms.isle.index.SemanticAnnotator; +import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie; +import org.dllearner.algorithms.isle.index.TextDocument; +import org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator; +import org.dllearner.algorithms.isle.index.TrieLinguisticAnnotator; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; +import org.dllearner.algorithms.isle.wsd.StructureBasedWordSenseDisambiguation; +import 
org.dllearner.algorithms.isle.wsd.WindowBasedContextExtractor; +import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLAnnotation; +import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLLiteral; +import org.semanticweb.owlapi.model.OWLOntology; + +import com.google.common.hash.HashCode; +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; + +/** + * Interface for an index which is able to resolve a given entity's URI to the set of documents containing + * this entity, i.e., documents which contain words disambiguated to the given entity. + * + * @author Lorenz Buehmann + * @author Daniel Fleischhacker + */ +public abstract class SemanticIndexGenerator { + + static HashFunction hf = Hashing.md5(); + private static final Logger logger = Logger.getLogger(SemanticIndexGenerator.class.getName()); + private static boolean useCache = false; + + public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, WordSenseDisambiguation wordSenseDisambiguation, + EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator){ + SemanticAnnotator semanticAnnotator = new SemanticAnnotator(wordSenseDisambiguation, entityCandidateGenerator, linguisticAnnotator); + return generateIndex(documents, ontology, semanticAnnotator); + } + + public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, SemanticAnnotator semanticAnnotator){ + SemanticIndex semanticIndex; + //try to load serialized version + HashCode hc = hf.newHasher().putInt(documents.hashCode()).putInt(ontology.hashCode()).hash(); + File file = new File(hc.toString() + ".ser"); + if(useCache && file.exists()){ + try { + logger.info("Loading semantic index from disk..."); + ObjectInputStream ois = new ObjectInputStream(new FileInputStream(file)); + semanticIndex = (SemanticIndex) ois.readObject(); + ois.close(); + logger.info("...done."); + } catch (Exception e) { + e.printStackTrace(); + semanticIndex = buildIndex(semanticAnnotator, documents); + } + } else { + logger.info("Building semantic index..."); + semanticIndex = buildIndex(semanticAnnotator, documents); + try { + ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(file)); + oos.writeObject(semanticIndex); + oos.close(); + } catch (IOException e1) { + e1.printStackTrace(); + } + logger.info("...done."); + } + return semanticIndex; + } + + public static SemanticIndex generateIndex(Set<String> documents, OWLOntology ontology, boolean useWordNormalization){ + SimpleEntityCandidatesTrie trie; + if (useWordNormalization) { + trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), + ontology, new SimpleEntityCandidatesTrie.LemmatizingWordNetNameGenerator(5)); + } + else { + trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), + ontology, new SimpleEntityCandidatesTrie.DummyNameGenerator()); + } + trie.printTrie(); + + TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(trie); + linguisticAnnotator.setNormalizeWords(useWordNormalization); + + SemanticAnnotator semanticAnnotator = new SemanticAnnotator( + new StructureBasedWordSenseDisambiguation(new WindowBasedContextExtractor(), ontology), + new TrieEntityCandidateGenerator(ontology, trie), + linguisticAnnotator); + return generateIndex(documents, ontology, 
semanticAnnotator); + } + + public static SemanticIndex generateIndex(OWLOntology ontology, OWLAnnotationProperty annotationProperty, String language, boolean useWordNormalization){ + Set<OWLEntity> schemaEntities = new HashSet<OWLEntity>(); + schemaEntities.addAll(ontology.getClassesInSignature()); + schemaEntities.addAll(ontology.getObjectPropertiesInSignature()); + schemaEntities.addAll(ontology.getDataPropertiesInSignature()); + Set<String> documents = new HashSet<String>(); + for (OWLEntity entity : schemaEntities) { + String label = null; + Set<OWLAnnotation> annotations = entity.getAnnotations(ontology, annotationProperty); + for (OWLAnnotation annotation : annotations) { + if (annotation.getValue() instanceof OWLLiteral) { + OWLLiteral val = (OWLLiteral) annotation.getValue(); + if (language != null) { + if (val.hasLang(language)) { + label = val.getLiteral(); + } + } + else { + label = val.getLiteral(); + } + } + } + if (label != null) { + documents.add(label); + } + } + return generateIndex(documents, ontology, useWordNormalization); + } + + /** + * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents. + */ + private static SemanticIndex buildIndex(SemanticAnnotator semanticAnnotator, Set<String> documents) { + logger.info("Creating semantic index..."); + SemanticIndex index = new SemanticIndex(); + for (String document : documents) { + TextDocument textDocument = TextDocumentGenerator.getInstance().generateDocument(document); + logger.debug("Processing document:" + textDocument); + AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(textDocument); + for (Entity entity : annotatedDocument.getContainedEntities()) { + Set<AnnotatedDocument> existingAnnotatedDocuments = index.get(entity); + if (existingAnnotatedDocuments == null) { + existingAnnotatedDocuments = new HashSet<AnnotatedDocument>(); + index.put(entity, existingAnnotatedDocuments); + } + existingAnnotatedDocuments.add(annotatedDocument); + } + logger.debug("Annotated document:" + annotatedDocument); + } + int size = documents.size(); + index.setTotalNrOfDocuments(size); + logger.info("...done."); + return index; + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-03 12:41:34 UTC (rev 4192) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-09 14:22:20 UTC (rev 4193) @@ -90,10 +90,13 @@ if (annotation.getValue() instanceof OWLLiteral) { OWLLiteral val = (OWLLiteral) annotation.getValue(); if (val.hasLang(language)) { + //trim String label = val.getLiteral().trim(); if(entity instanceof NamedClass){ label = label.toLowerCase(); } + //remove content in brackets like (...) + label = label.replaceAll("\\s?\\((.*?)\\)", ""); textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label), weight); } }
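The new generator bundles the whole pipeline (trie construction, linguistic annotation, word sense disambiguation) behind static factory methods. A usage sketch of the label-driven overload added here; the language tag and the reuse of the NTNames ontology are illustrative choices only:

    import org.dllearner.algorithms.isle.index.semantic.SemanticIndex;
    import org.dllearner.algorithms.isle.index.semantic.SemanticIndexGenerator;
    import org.semanticweb.owlapi.apibinding.OWLManager;
    import org.semanticweb.owlapi.model.IRI;
    import org.semanticweb.owlapi.model.OWLAnnotationProperty;
    import org.semanticweb.owlapi.model.OWLOntology;
    import org.semanticweb.owlapi.vocab.OWLRDFVocabulary;

    import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl;

    public class SemanticIndexSketch {
        public static void main(String[] args) throws Exception {
            OWLOntology ontology = OWLManager.createOWLOntologyManager()
                    .loadOntology(IRI.create("http://www.semanticbible.com/2006/11/NTNames.owl"));
            OWLAnnotationProperty rdfsLabel = new OWLDataFactoryImpl()
                    .getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI());
            // collects the English rdfs:label literals as a document corpus, annotates
            // them, and returns the entity -> annotated-documents index; with useCache
            // enabled the result is also serialized to disk under an MD5 hash of the input
            SemanticIndex index = SemanticIndexGenerator.generateIndex(ontology, rdfsLabel, "en", true);
        }
    }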
From: <lor...@us...> - 2013-12-09 15:34:19
|
Revision: 4196 http://sourceforge.net/p/dl-learner/code/4196 Author: lorenz_b Date: 2013-12-09 15:34:15 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Added syntactic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -17,6 +17,8 @@ import edu.stanford.nlp.trees.CollinsHeadFinder; import edu.stanford.nlp.trees.Tree; import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation; +import edu.stanford.nlp.trees.tregex.TregexMatcher; +import edu.stanford.nlp.trees.tregex.TregexPattern; import edu.stanford.nlp.util.CoreMap; public class TextDocumentGenerator { @@ -41,6 +43,10 @@ } public TextDocument generateDocument(String text) { + return generateDocument(text, false); + } + + public TextDocument generateDocument(String text, boolean determineHead) { TextDocument document = new TextDocument(); // create an empty Annotation just with the given text Annotation annotatedDocument = new Annotation(text); @@ -53,6 +59,33 @@ List<CoreMap> sentences = annotatedDocument.get(SentencesAnnotation.class); for(CoreMap sentence: sentences) { + + //determine the head noun + String head = null; + if(determineHead){ + //if phrase only contains one single token, the task is trivial + if(sentence.get(TokensAnnotation.class).size() == 1){ + head = sentence.get(TokensAnnotation.class).get(0).get(TextAnnotation.class); + } else { + Tree tree = sentence.get(TreeAnnotation.class); + CollinsHeadFinder headFinder = new CollinsHeadFinder(); +// Tree head = headFinder.determineHead(tree); +// System.out.println(sentence); +// System.out.println(tree.headTerminal(headFinder)); + head = tree.headTerminal(headFinder).toString(); + + // Create a reusable pattern object + TregexPattern patternMW = TregexPattern.compile("__ >># NP"); + // Run the pattern on one particular tree + TregexMatcher matcher = patternMW.matcher(tree); + // Iterate over all of the subtrees that matched + while (matcher.findNextMatchingNode()) { + Tree match = matcher.getMatch(); + // do what we want to with the subtree + } + } + } + for (CoreLabel label: sentence.get(TokensAnnotation.class)) { // this is the text of the token String word = 
label.get(TextAnnotation.class); @@ -71,10 +104,9 @@ Token token = new Token(word, lemma, pos, isPunctuation, isStopWord); - //determine the head noun - Tree tree = sentence.get(TreeAnnotation.class); - CollinsHeadFinder headFinder = new CollinsHeadFinder(); - Tree head = headFinder.determineHead(tree); + if(determineHead && word.equals(head)){ + token.setIsHead(true); + } document.add(token); } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -0,0 +1,31 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.util.Set; + +import org.dllearner.core.owl.Entity; + +/** + * @author Lorenz Buehmann + * + */ +public interface Index { + + /** + * Returns a set of documents based on how the underlying index is processing the given + * search string. + * + * @param searchString query specifying the documents to retrieve + * @return set of documents retrieved based on the given query string + */ + Set<AnnotatedDocument> getDocuments(Entity entity); + + /** + * Returns the total number of documents contained in the index. + * + * @return the total number of documents contained in the index + */ + int getTotalNumberOfDocuments(); +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -21,6 +21,7 @@ private String posTag; private boolean isPunctuation; private boolean isStopWord; + private boolean isHead; /// for storing alternative forms of this token, e.g., generated by WordNet synonyms private HashSet<String> alternativeForms; @@ -36,7 +37,7 @@ this.isStopWord = isStopWord; this.alternativeForms = new HashSet<>(); } - + /** * @return the rawForm */ @@ -117,6 +118,20 @@ public void setIsStopWord(boolean isStopWord) { this.isStopWord = isStopWord; } + + /** + * @param wheteher the token is the head of the containg sequence of tokens + */ + public void setIsHead(boolean isHead) { + this.isHead = isHead; + } + + /** + * @return the isHead + */ + public boolean isHead() { + return isHead; + } /* (non-Javadoc) * @see java.lang.Object#toString() Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -3,6 +3,15 @@ */ package org.dllearner.algorithms.isle.index.syntactic; +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import 
org.apache.lucene.index.DirectoryReader; @@ -12,71 +21,88 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TotalHitCountCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.AnnotatedTextDocument; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.algorithms.isle.index.TextDocument; +import org.dllearner.algorithms.isle.index.Token; +import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; -import java.io.File; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - /** * @author Lorenz Buehmann * */ -public class LuceneSyntacticIndex implements SyntacticIndex { +public class LuceneSyntacticIndex implements Index { private IndexSearcher searcher; private QueryParser parser; private IndexReader indexReader; private String searchField; + + AnnotationEntityTextRetriever textRetriever; - public LuceneSyntacticIndex(IndexReader indexReader, String searchField) throws Exception { + public LuceneSyntacticIndex(OWLOntology ontology, IndexReader indexReader, String searchField) throws Exception { this.indexReader = indexReader; this.searchField = searchField; searcher = new IndexSearcher(indexReader); StandardAnalyzer analyzer = new StandardAnalyzer( Version.LUCENE_43); parser = new QueryParser( Version.LUCENE_43, searchField, analyzer ); + + textRetriever = new RDFSLabelEntityTextRetriever(ontology); } - public LuceneSyntacticIndex(Directory directory, String searchField) throws Exception { - this(DirectoryReader.open(directory), searchField); + public LuceneSyntacticIndex(OWLOntology ontology, Directory directory, String searchField) throws Exception { + this(ontology, DirectoryReader.open(directory), searchField); } - public LuceneSyntacticIndex(String indexDirectory, String searchField) throws Exception { - this(DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), searchField); + public LuceneSyntacticIndex(OWLOntology ontology, String indexDirectory, String searchField) throws Exception { + this(ontology, DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), searchField); } /* (non-Javadoc) * @see org.dllearner.algorithms.isle.SyntacticIndex#getDocuments(java.lang.String) */ @Override - public Set<org.dllearner.algorithms.isle.index.Document> getDocuments(String searchString) { - Set<org.dllearner.algorithms.isle.index.Document> documents = new HashSet<org.dllearner.algorithms.isle.index.Document>(); - try { - Query query = parser.parse(searchString); - ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs; - for (int i = 0; i < result.length; i++) { - Document doc = searcher.doc(result[i].doc); - documents.add(TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField))); + public Set<AnnotatedDocument> getDocuments(Entity entity) { + Set<AnnotatedDocument> documents = new HashSet<AnnotatedDocument>(); + + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + + for (Entry<List<Token>, Double> entry : 
relevantText.entrySet()) { + List<Token> tokens = entry.getKey(); + for (Token token : tokens) { + try { + Query query = parser.parse(token.getRawForm()); + ScoreDoc[] result = searcher.search(query, getTotalNumberOfDocuments()).scoreDocs; + for (int i = 0; i < result.length; i++) { + Document doc = searcher.doc(result[i].doc); + documents.add(new AnnotatedTextDocument( + TextDocumentGenerator.getInstance().generateDocument(doc.get(searchField)), + Collections.EMPTY_SET)); + } + } catch (ParseException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } } - } catch (ParseException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); } + return documents; } /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SyntacticIndex#getSize() + * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments() */ @Override - public int getSize() { + public int getTotalNumberOfDocuments() { return indexReader.numDocs(); } @@ -94,22 +120,5 @@ return documents; } - /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.SyntacticIndex#count(java.lang.String) - */ - @Override - public int count(String searchString) { - try { - Query query = parser.parse(searchString); - TotalHitCountCollector results = new TotalHitCountCollector(); - searcher.search(query, results); - return results.getTotalHits(); - } catch (ParseException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - return -1; - } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/OWLOntologyLuceneSyntacticIndexCreator.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -3,6 +3,10 @@ */ package org.dllearner.algorithms.isle.index.syntactic; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; @@ -14,14 +18,17 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; -import org.semanticweb.owlapi.model.*; +import org.dllearner.algorithms.isle.index.Index; +import org.semanticweb.owlapi.model.OWLAnnotation; +import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLDataFactory; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLLiteral; +import org.semanticweb.owlapi.model.OWLOntology; import org.semanticweb.owlapi.vocab.OWLRDFVocabulary; + import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - /** * Creates a Lucene Index for the labels if classes and properties. 
* @author Lorenz Buehmann @@ -49,7 +56,7 @@ schemaEntities.addAll(ontology.getDataPropertiesInSignature()); } - public SyntacticIndex buildIndex() throws Exception{ + public Index buildIndex() throws Exception{ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(directory, indexWriterConfig); Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SyntacticIndex.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -1,43 +0,0 @@ -/** - * - */ -package org.dllearner.algorithms.isle.index.syntactic; - -import org.dllearner.algorithms.isle.index.Document; - -import java.util.Set; - -/** - * Interface for a syntactic index, e.g., a basic string-based inverted index. - * - * @author Lorenz Buehmann - * @author Daniel Fleischhacker - */ -public interface SyntacticIndex { - - /** - * Returns a set of documents based on how the underlying index is processing the given - * search string. - * - * @param searchString query specifying the documents to retrieve - * @return set of documents retrieved based on the given query string - */ - Set<Document> getDocuments(String searchString); - - /** - * Returns the number of documents based on how the underlying index is processing the - * given search string. - * - * @param searchString query specifying the documents to include in the number of documents - * @return number of documents retrieved based on the given query string - */ - int count(String searchString); - - /** - * Returns the total number of documents contained in the index. 
- * - * @return the total number of documents contained in the index - */ - int getSize(); - -} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -3,6 +3,13 @@ */ package org.dllearner.algorithms.isle.index.syntactic; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Field; @@ -14,15 +21,9 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.algorithms.isle.index.TextDocument; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - /** * Creates a syntactic index from text files stored on disk * @@ -39,7 +40,7 @@ this.inputDirectory = inputDirectory; } - public SyntacticIndex buildIndex() throws Exception{ + public Index buildIndex() throws Exception{ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); @@ -79,7 +80,7 @@ return new LuceneSyntacticIndex(indexDirectory, searchField); } - public SyntacticIndex buildIndex(Set<TextDocument> documents) throws Exception{ + public Index buildIndex(Set<TextDocument> documents) throws Exception{ Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); @@ -106,7 +107,7 @@ return new LuceneSyntacticIndex(indexDirectory, searchField); } - public static SyntacticIndex loadIndex(File indexDirectory) throws Exception { + public static Index loadIndex(File indexDirectory) throws Exception { return new LuceneSyntacticIndex(new SimpleFSDirectory(indexDirectory), searchField); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/AnnotationEntityTextRetriever.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -46,6 +46,7 @@ private boolean useShortFormFallback = true; private IRIShortFormProvider sfp = new SimpleIRIShortFormProvider(); + protected boolean determineHeadNoun = false; private OWLAnnotationProperty[] properties; @@ -97,7 +98,7 @@ } //remove content in brackets like (...) 
label = label.replaceAll("\\s?\\((.*?)\\)", ""); - textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label), weight); + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(label, determineHeadNoun), weight); } } } @@ -107,7 +108,7 @@ String shortForm = sfp.getShortForm(IRI.create(entity.getURI())); shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromCamelCase(shortForm)); shortForm = Joiner.on(" ").join(LinguisticUtil.getInstance().getWordsFromUnderscored(shortForm)).trim(); - textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(shortForm), weight); + textWithWeight.put(TextDocumentGenerator.getInstance().generateDocument(shortForm, determineHeadNoun), weight); } return textWithWeight; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java 2013-12-09 14:40:04 UTC (rev 4195) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/textretrieval/RDFSLabelEntityTextRetriever.java 2013-12-09 15:34:15 UTC (rev 4196) @@ -34,10 +34,12 @@ public RDFSLabelEntityTextRetriever(OWLOntology ontology) { super(ontology, new OWLDataFactoryImpl().getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI())); + determineHeadNoun = true; } public RDFSLabelEntityTextRetriever(OWLAPIOntology ontology) { super(ontology, new OWLDataFactoryImpl().getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI())); + determineHeadNoun = true; } public static void main(String[] args) throws Exception { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-12-10 13:16:50
Revision: 4203 http://sourceforge.net/p/dl-learner/code/4203 Author: lorenz_b Date: 2013-12-10 13:16:47 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Added SOLR based syntactic index. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/NLPHeuristic.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -112,14 +112,14 @@ // OWLClassExpression owlapiDescription = OWLAPIConverter.getOWLAPIDescription(expression); // Set<Entity> entities = OWLAPIConverter.getEntities(owlapiDescription.getSignature()); Set<Entity> entities = expression.getSignature(); - double sum = 0; - for (Entity entity : entities) { - double relevance = entityRelevance.containsKey(entity) ? entityRelevance.get(entity) : 0;//System.out.println(entity + ":" + relevance); - if(!Double.isInfinite(relevance)){ - sum += relevance; - } - } - score += nlpBonusFactor * sum; +// double sum = 0; +// for (Entity entity : entities) { +// double relevance = entityRelevance.containsKey(entity) ?
entityRelevance.get(entity) : 0;//System.out.println(entity + ":" + relevance); +// if(!Double.isInfinite(relevance)){ +// sum += relevance; +// } +// } +// score += nlpBonusFactor * sum; return score; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/TextDocumentGenerator.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -74,15 +74,15 @@ // System.out.println(tree.headTerminal(headFinder)); head = tree.headTerminal(headFinder).toString(); - // Create a reusable pattern object - TregexPattern patternMW = TregexPattern.compile("__ >># NP"); - // Run the pattern on one particular tree - TregexMatcher matcher = patternMW.matcher(tree); - // Iterate over all of the subtrees that matched - while (matcher.findNextMatchingNode()) { - Tree match = matcher.getMatch(); - // do what we want to with the subtree - } +// // Create a reusable pattern object +// TregexPattern patternMW = TregexPattern.compile("__ >># NP"); +// // Run the pattern on one particular tree +// TregexMatcher matcher = patternMW.matcher(tree); +// // Iterate over all of the subtrees that matched +// while (matcher.findNextMatchingNode()) { +// Tree match = matcher.getMatch(); +// // do what we want to with the subtree +// } } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Index.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -21,11 +21,29 @@ * @return set of documents retrieved based on the given query string */ Set<AnnotatedDocument> getDocuments(Entity entity); + + /** + * Returns the number of documents in the index that contain + * a mention of the given entity. + * + * @param entity the entity to count documents for + * @return number of documents mentioning the given entity + */ + long getNumberOfDocumentsFor(Entity entity); + + /** + * Returns the number of documents in the index that contain + * mentions of all given entities. + * + * @param entities the entities to count joint documents for + * @return number of documents mentioning all given entities + */ + long getNumberOfDocumentsFor(Entity... entities); /** * Returns the total number of documents contained in the index.
* * @return the total number of documents contained in the index */ - int getTotalNumberOfDocuments(); + long getTotalNumberOfDocuments(); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -5,6 +5,7 @@ import java.util.Set; import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.core.owl.Entity; /** @@ -14,7 +15,7 @@ * @author Lorenz Buehmann * @author Daniel Fleischhacker */ -public class SemanticIndex extends HashMap<Entity, Set<AnnotatedDocument>>{ +public class SemanticIndex extends HashMap<Entity, Set<AnnotatedDocument>> implements Index{ private int nrOfDocuments; @@ -49,11 +50,33 @@ this.nrOfDocuments = nrOfDocuments; } - /** - * @return the nrOfDocuments + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments() */ - public int getTotalNrOfDocuments() { + @Override + public long getTotalNumberOfDocuments() { return nrOfDocuments; } + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity) + */ + @Override + public long getNumberOfDocumentsFor(Entity entity) { + return getDocuments(entity).size(); + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[]) + */ + @Override + public long getNumberOfDocumentsFor(Entity... 
entities) { + + Set<AnnotatedDocument> documents = getDocuments(entities[0]); + for (int i = 1; i < entities.length; i++) { + documents.retainAll(getDocuments(entities[i])); + } + return documents.size(); + } + } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -80,7 +80,7 @@ for (Token token : tokens) { try { Query query = parser.parse(token.getRawForm()); - ScoreDoc[] result = searcher.search(query, getTotalNumberOfDocuments()).scoreDocs; + ScoreDoc[] result = searcher.search(query, indexReader.numDocs()).scoreDocs; for (int i = 0; i < result.length; i++) { Document doc = searcher.doc(result[i].doc); documents.add(new AnnotatedTextDocument( @@ -102,7 +102,7 @@ * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments() */ @Override - public int getTotalNumberOfDocuments() { + public long getTotalNumberOfDocuments() { return indexReader.numDocs(); } @@ -120,5 +120,21 @@ return documents; } + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity) + */ + @Override + public long getNumberOfDocumentsFor(Entity entity) { + return 0; + } + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[]) + */ + @Override + public long getNumberOfDocumentsFor(Entity... entities) { + return 0; + } + + } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/NTriplesFileLuceneSyntacticIndexCreator.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -0,0 +1,122 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index.syntactic; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +import org.apache.jena.riot.Lang; +import org.apache.jena.riot.RiotReader; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexWriterConfig.OpenMode; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version; + +import com.hp.hpl.jena.graph.Triple; + +/** + * Creates a Lucene Index for the labels if classes and properties. + * @author Lorenz Buehmann + * + */ +public class NTriplesFileLuceneSyntacticIndexCreator { + + public NTriplesFileLuceneSyntacticIndexCreator(InputStream nTriplesStream, String indexPath, String searchField) throws IOException { + //setup the index + Directory directory = FSDirectory.open(new File(indexPath)); + + //setup the index analyzer + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); + indexWriterConfig.setRAMBufferSizeMB(1024.0); + indexWriterConfig.setOpenMode(OpenMode.CREATE); + IndexWriter writer = new IndexWriter(directory, indexWriterConfig); + + System.out.println( "Creating index ..." ); + + // setup the index fields, here two fields, for URI and text + FieldType stringType = new FieldType(StringField.TYPE_STORED); + stringType.setStoreTermVectors(false); + FieldType textType = new FieldType(TextField.TYPE_STORED); + textType.setStoreTermVectors(false); + + Set<Document> documents = new HashSet<Document>(); + + Iterator<Triple> iterator = RiotReader.createIteratorTriples(nTriplesStream, Lang.NTRIPLES, null); + + Triple triple; + String text; + String uri; + Document doc; + int i = 0; + while(iterator.hasNext()){ + triple = iterator.next(); + + uri = triple.getSubject().getURI(); + text = triple.getObject().getLiteralLexicalForm(); + + doc = new Document(); + doc.add(new Field("uri", uri, stringType)); + doc.add(new Field(searchField, text, textType)); + + writer.addDocument(doc); + if(i++ % 10000 == 0){ +// writer.commit(); + System.out.println(i); + } + + } + + writer.commit(); + writer.close(); + } + + public static void main(String[] args) throws Exception { + String indexFile = "/home/me/Documents/short_abstracts_en.nt"; +// indexFile = "/tmp/test.nt"; + String indexPath = "/home/me/Documents/dbpedia/short_abstracts_index"; +// indexPath = "/tmp/index"; + String field = "text"; + new NTriplesFileLuceneSyntacticIndexCreator(new FileInputStream(indexFile), indexPath, field); + + IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath))); + IndexSearcher searcher = new IndexSearcher(reader); + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + + QueryParser parser = new QueryParser(Version.LUCENE_43, field, analyzer); + Query query = parser.parse("film AND direction"); + + TopDocs docs = searcher.search(query, 10); + ScoreDoc[] scoreDocs = docs.scoreDocs; + + for (int i = 0; i < scoreDocs.length; i++) { + Document doc = searcher.doc(scoreDocs[i].doc); + System.out.println(doc.get(field)); + + } + } + + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/SolrSyntacticIndex.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -0,0 +1,176 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index.syntactic; + +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrServer; 
+import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.impl.HttpSolrServer; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.dllearner.algorithms.isle.TextDocumentGenerator; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; +import org.dllearner.algorithms.isle.index.AnnotatedTextDocument; +import org.dllearner.algorithms.isle.index.Index; +import org.dllearner.algorithms.isle.index.Token; +import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; + +import com.google.common.base.Joiner; + +/** + * @author Lorenz Buehmann + * + */ +public class SolrSyntacticIndex implements Index{ + + private SolrServer solr; + private AnnotationEntityTextRetriever textRetriever; + private String searchField; + + long totalNumberOfDocuments = -1; + + public SolrSyntacticIndex(OWLOntology ontology, String solrServerURL, String searchField) { + this.searchField = searchField; + solr = new HttpSolrServer(solrServerURL); + textRetriever = new RDFSLabelEntityTextRetriever(ontology); + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getDocuments(org.dllearner.core.owl.Entity) + */ + @Override + public Set<AnnotatedDocument> getDocuments(Entity entity) { + Set<AnnotatedDocument> documents = new HashSet<AnnotatedDocument>(); + + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + + for (Entry<List<Token>, Double> entry : relevantText.entrySet()) { + List<Token> tokens = entry.getKey(); + for (Token token : tokens) { + SolrQuery query = new SolrQuery(searchField + ":" + token.getRawForm()); + query.setRows(Integer.MAX_VALUE);//can be very slow + try { + QueryResponse response = solr.query(query); + SolrDocumentList list = response.getResults(); + System.out.println(list.getNumFound()); + for (SolrDocument doc : list) { + String uri = (String) doc.getFieldValue("uri"); + String comment = (String) doc.getFieldValue(searchField); + + documents.add(new AnnotatedTextDocument( + TextDocumentGenerator.getInstance().generateDocument((String) doc.getFieldValue(searchField)), + Collections.EMPTY_SET)); + } + } catch (SolrServerException e) { + e.printStackTrace(); + } + } + } + return documents; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getTotalNumberOfDocuments() + */ + @Override + public long getTotalNumberOfDocuments() { + if(totalNumberOfDocuments == -1){ + SolrQuery q = new SolrQuery("*:*"); + q.setRows(0); // don't actually request any data + try { + totalNumberOfDocuments = solr.query(q).getResults().getNumFound(); + } catch (SolrServerException e) { + e.printStackTrace(); + } + } + return totalNumberOfDocuments; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity) + */ + @Override + public long getNumberOfDocumentsFor(Entity entity) { + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + + String queryString = "("; + Set<String> terms = new HashSet<>(); + for (Entry<List<Token>, Double> entry : relevantText.entrySet()) { + List<Token> tokens = entry.getKey(); + String phrase = ""; + for (Token token : tokens) { +// terms.add(token.getRawForm()); + phrase += 
token.getRawForm() + " "; + } + phrase = phrase.trim(); + terms.add(phrase); + } + queryString += Joiner.on(" OR ").join(terms); + queryString += ")"; + + SolrQuery query = new SolrQuery(searchField + ":" + queryString);System.out.println(query); + try { + QueryResponse response = solr.query(query); + SolrDocumentList list = response.getResults(); + return list.getNumFound(); + } catch (SolrServerException e) { + e.printStackTrace(); + } + return -1; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Index#getNumberOfDocumentsFor(org.dllearner.core.owl.Entity[]) + */ + @Override + public long getNumberOfDocumentsFor(Entity... entities) { + + Set<String> queryStringParts = new HashSet<>(); + + for (Entity entity : entities) { + Map<List<Token>, Double> relevantText = textRetriever.getRelevantText(entity); + + String queryString = "("; + Set<String> terms = new HashSet<>(); + for (Entry<List<Token>, Double> entry : relevantText.entrySet()) { + List<Token> tokens = entry.getKey(); + String phrase = ""; + for (Token token : tokens) { +// terms.add(token.getRawForm()); + phrase += token.getRawForm() + " "; + } + phrase = phrase.trim(); + terms.add(phrase); + } + queryString += Joiner.on(" OR ").join(terms); + queryString += ")"; + queryStringParts.add(queryString); + } + + String queryStringConjunction = "(" + Joiner.on(" AND ").join(queryStringParts) + ")"; + + + SolrQuery query = new SolrQuery(searchField + ":" + queryStringConjunction);System.out.println(query); + try { + QueryResponse response = solr.query(query); + SolrDocumentList list = response.getResults(); + return list.getNumFound(); + } catch (SolrServerException e) { + e.printStackTrace(); + } + return -1; + } + +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/AbstractRelevanceMetric.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -6,7 +6,7 @@ import java.util.HashMap; import java.util.Map; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; +import org.dllearner.algorithms.isle.index.Index; import org.semanticweb.owlapi.model.OWLEntity; /** @@ -15,9 +15,9 @@ */ public abstract class AbstractRelevanceMetric implements RelevanceMetric { - protected SemanticIndex index; + protected Index index; - public AbstractRelevanceMetric(SemanticIndex index) { + public AbstractRelevanceMetric(Index index) { this.index = index; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-12-10 12:52:52 UTC (rev 4202) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/metrics/PMIRelevanceMetric.java 2013-12-10 13:16:47 UTC (rev 4203) @@ -6,7 +6,7 @@ import java.util.Set; import org.dllearner.algorithms.isle.index.AnnotatedDocument; -import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; +import org.dllearner.algorithms.isle.index.Index; import org.dllearner.core.owl.Entity; import com.google.common.collect.Sets; @@ -17,21 +17,22 @@ */ public class PMIRelevanceMetric extends AbstractRelevanceMetric { - public PMIRelevanceMetric(SemanticIndex
index) { + public PMIRelevanceMetric(Index index) { super(index); } @Override public double getRelevance(Entity entityA, Entity entityB){ - Set<AnnotatedDocument> documentsA = index.getDocuments(entityA); - Set<AnnotatedDocument> documentsB = index.getDocuments(entityB); - Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB); - int nrOfDocuments = index.getTotalNrOfDocuments(); + long nrOfDocumentsA = index.getNumberOfDocumentsFor(entityA); + long nrOfDocumentsB = index.getNumberOfDocumentsFor(entityB); + long nrOfDocumentsAB = index.getNumberOfDocumentsFor(entityA, entityB); - double pA = nrOfDocuments == 0 ? 0 : ((double) documentsA.size() / (double) nrOfDocuments); - double pB = nrOfDocuments == 0 ? 0 : ((double) documentsB.size() / (double) nrOfDocuments); - double pAB = nrOfDocuments == 0 ? 0 : ((double) documentsAB.size() / (double) nrOfDocuments); + long nrOfDocuments = index.getTotalNumberOfDocuments(); + double pA = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsA / (double) nrOfDocuments); + double pB = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsB / (double) nrOfDocuments); + double pAB = nrOfDocuments == 0 ? 0 : ((double) nrOfDocumentsAB / (double) nrOfDocuments); + double pmi = Math.log(pAB / (pA * pB)); return pmi; @@ -42,7 +43,7 @@ Set<AnnotatedDocument> documentsA = index.getDocuments(entityA); Set<AnnotatedDocument> documentsB = index.getDocuments(entityB); Set<AnnotatedDocument> documentsAB = Sets.intersection(documentsA, documentsB); - int nrOfDocuments = index.getTotalNrOfDocuments(); + long nrOfDocuments = index.getTotalNumberOfDocuments(); // System.out.println("A:" + documentsA.size()); // System.out.println("B:" + documentsB.size()); // System.out.println("AB:" + documentsAB.size()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
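The counting methods introduced in this revision exist to feed the pointwise mutual information above: with pA = nA/N, pB = nB/N and pAB = nAB/N over N indexed documents, PMI(A,B) = log(pAB / (pA * pB)). The explicit parentheses are essential, since Java evaluates pAB / pA * pB left to right as (pAB / pA) * pB. A small worked sketch with invented counts:

public class PmiSketch {

    /**
     * Pointwise mutual information from document counts:
     * PMI(A,B) = log( pAB / (pA * pB) ) with pX = nX / N.
     * In the metric above, the three counts come from Index#getNumberOfDocumentsFor
     * and N from Index#getTotalNumberOfDocuments.
     */
    static double pmi(long docsA, long docsB, long docsAB, long total) {
        if (total == 0 || docsA == 0 || docsB == 0) {
            return Double.NaN; // marginal probabilities undefined
        }
        double pA = (double) docsA / total;
        double pB = (double) docsB / total;
        double pAB = (double) docsAB / total;
        // the parentheses matter: pAB / pA * pB would be (pAB / pA) * pB
        return Math.log(pAB / (pA * pB));
    }

    public static void main(String[] args) {
        // invented counts: 10000 documents, A in 200, B in 400, A and B together in 50
        // pA = 0.02, pB = 0.04, pAB = 0.005, so PMI = ln(6.25), roughly 1.83
        System.out.println(pmi(200, 400, 50, 10000));
    }
}

For the Solr-backed index these counts come from the numFound of a query against the search field; for the in-memory SemanticIndex they are the sizes of the per-entity document sets and of their intersection.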
From: <dfl...@us...> - 2013-12-10 15:25:17
Revision: 4207 http://sourceforge.net/p/dl-learner/code/4207 Author: dfleischhacker Date: 2013-12-10 15:25:13 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Add scoring for hyponyms and token tree Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/WordNet.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -13,6 +13,8 @@ public class WordNet { + private static final double SYNONYM_FACTOR = 0.8; + private static final double HYPONYM_FACTOR = 0.4; public Dictionary dict; public WordNet() { @@ -280,6 +282,42 @@ } } + public List<LemmaScorePair> getHyponymsScored(POS pos, String s) { + ArrayList<LemmaScorePair> result = new ArrayList<>(); + try { + IndexWord word = dict.getIndexWord(pos, s); + if (word == null) { + System.err.println("Unable to find index word for " + s); + return result; + } + Synset sense = word.getSense(1); + getHyponymsScoredRecursive(result, sense, 3, SYNONYM_FACTOR); + } + catch (JWNLException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + return result; + } + + public void getHyponymsScoredRecursive(List<LemmaScorePair> lemmas, Synset sense, int depthToGo, double score) { + for (Word w : sense.getWords()) { + lemmas.add(new LemmaScorePair(w.getLemma(), score)); + } + if (depthToGo == 0) { + return; + } + try { + PointerTargetNodeList directHyponyms = PointerUtils.getInstance().getDirectHyponyms(sense); + for (Object directHyponym : directHyponyms) { + getHyponymsScoredRecursive(lemmas, ((PointerTargetNode) directHyponym).getSynset(), depthToGo - 1, + score * HYPONYM_FACTOR); + } + } + catch (JWNLException e) { + e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates. + } + } + /** * Funktion returns a List of Hypo and Hypernyms of a given string * @@ -356,4 +394,71 @@ return result; } + public static class LemmaScorePair implements Comparable<LemmaScorePair> { + private String lemma; + private Double score; + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + LemmaScorePair that = (LemmaScorePair) o; + + if (lemma != null ? !lemma.equals(that.lemma) : that.lemma != null) { + return false; + } + if (score != null ? !score.equals(that.score) : that.score != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = lemma != null ? lemma.hashCode() : 0; + result = 31 * result + (score != null ? 
score.hashCode() : 0); + return result; + } + + public String getLemma() { + + return lemma; + } + + public void setLemma(String lemma) { + this.lemma = lemma; + } + + public Double getScore() { + return score; + } + + public void setScore(Double score) { + this.score = score; + } + + public LemmaScorePair(String lemma, Double score) { + + this.lemma = lemma; + this.score = score; + } + + @Override + public int compareTo(LemmaScorePair o) { + int val = score.compareTo(o.score); + + if (val == 0) { + val = lemma.compareTo(o.getLemma()); + } + + return val; + } + } + } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityScorePair.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -0,0 +1,77 @@ +package org.dllearner.algorithms.isle.index; + +import org.dllearner.core.owl.Entity; + +/** + * Represents a scored entity. The score is produced from the path used to retrieve it from the candidates tree. + * @author Daniel Fleischhacker + */ +public class EntityScorePair implements Comparable<EntityScorePair> { + @Override + public String toString() { + return entity + " : " + score; + } + + private Entity entity; + private Double score; + + @Override + public int compareTo(EntityScorePair o) { + int val = score.compareTo(o.score); + + if (val == 0) { + val = entity.getURI().toString().compareTo(o.entity.getURI().toString()); + } + + return val; + } + + public EntityScorePair(Entity entity, Double score) { + this.entity = entity; + this.score = score; + } + + public Entity getEntity() { + return entity; + } + + public void setEntity(Entity entity) { + this.entity = entity; + } + + public Double getScore() { + return score; + } + + public void setScore(Double score) { + this.score = score; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + + EntityScorePair that = (EntityScorePair) o; + + if (entity != null ? !entity.equals(that.entity) : that.entity != null) { + return false; + } + if (score != null ? !score.equals(that.score) : that.score != null) { + return false; + } + + return true; + } + + @Override + public int hashCode() { + int result = entity != null ? entity.hashCode() : 0; + result = 31 * result + (score != null ? 
score.hashCode() : 0); + return result; + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -5,8 +5,7 @@ import net.didion.jwnl.data.POS; import org.dllearner.algorithms.isle.WordNet; -import java.util.ArrayList; -import java.util.Collections; +import java.util.*; /** * Provides shortcuts to commonly used linguistic operations @@ -35,6 +34,26 @@ } } + public Set<WordNet.LemmaScorePair> getScoredHyponyms(String word, POS pos) { + List<WordNet.LemmaScorePair> pairs = wn.getHyponymsScored(pos, word); + HashMap<String, Double> lemmaScores = new HashMap<>(); + for (WordNet.LemmaScorePair p : pairs) { + if (!lemmaScores.containsKey(p.getLemma())) { + lemmaScores.put(p.getLemma(), p.getScore()); + } + else { + lemmaScores.put(p.getLemma(), Math.max(p.getScore(), lemmaScores.get(p.getLemma()))); + } + } + + TreeSet<WordNet.LemmaScorePair> scoredPairs = new TreeSet<>(); + for (Map.Entry<String, Double> e : lemmaScores.entrySet()) { + scoredPairs.add(new WordNet.LemmaScorePair(e.getKey(), e.getValue())); + } + + return scoredPairs; + } + /** * Processes the given string and puts camelCased words into single words. * @param camelCase the word containing camelcase to split Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -1,6 +1,7 @@ package org.dllearner.algorithms.isle.index; import net.didion.jwnl.data.POS; +import org.dllearner.algorithms.isle.WordNet; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; @@ -89,15 +90,16 @@ continue; } //String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); - String[] synonyms = LinguisticUtil.getInstance().getAllHyponymsForWord(t.getRawForm(), wordnetPos); + Set<WordNet.LemmaScorePair> alternativeFormPairs = LinguisticUtil.getInstance() + .getScoredHyponyms(t.getRawForm(), wordnetPos); - for (String synonym : synonyms) { + for (WordNet.LemmaScorePair synonym : alternativeFormPairs) { // ignore all multi word synonyms - if (synonym.contains("_")) { + if (synonym.getLemma().contains("_")) { continue; } //t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); - t.addAlternativeForm(synonym); + t.addAlternativeForm(synonym.getLemma(), synonym.getScore()); } } } @@ -113,9 +115,14 @@ @Override public Set<Entity> getCandidateEntities(List<Token> tokens) { - return tree.getAllEntities(tokens); - } + Set<Entity> res = tree.getAllEntities(tokens); + System.out.println("Unscored: " + res); + Set<EntityScorePair> scored = tree.getAllEntitiesScored(tokens); + System.out.println("Scored: " + scored); + return res; + } + @Override public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens) { return 
tree.getOriginalTokensForLongestMatch(tokens); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -7,7 +7,8 @@ import java.io.Serializable; import java.util.Collections; -import java.util.HashSet; +import java.util.HashMap; +import java.util.Map; import java.util.Set; /** @@ -23,7 +24,8 @@ private boolean isStopWord; private boolean isHead; /// for storing alternative forms of this token, e.g., generated by WordNet synonyms - private HashSet<String> alternativeForms; + private HashMap<String, Double> alternativeForms; + public Token(String rawForm) { this.rawForm = rawForm; @@ -35,7 +37,7 @@ this.posTag = posTag; this.isPunctuation = isPunctuation; this.isStopWord = isStopWord; - this.alternativeForms = new HashSet<>(); + this.alternativeForms = new HashMap<>(); } /** @@ -66,15 +68,22 @@ * @return unmodifiable set of alternative surface forms for this token */ public Set<String> getAlternativeForms() { - return Collections.unmodifiableSet(alternativeForms); + return Collections.unmodifiableSet(alternativeForms.keySet()); } /** + * Returns the map storing the scored alternative forms of this token. + */ + public Map<String, Double> getScoredAlternativeForms() { + return Collections.unmodifiableMap(alternativeForms); + } + + /** * Adds a new surface form to the alternative forms of this token. Alternative forms are included in comparison of * two tokens when using the {@link #equalsWithAlternativeForms}. */ - public void addAlternativeForm(String alternativeForm) { - this.alternativeForms.add(alternativeForm); + public void addAlternativeForm(String alternativeForm, Double score) { + this.alternativeForms.put(alternativeForm, score); } /** @@ -120,7 +129,7 @@ } /** - * @param wheteher the token is the head of the containg sequence of tokens + * @param isHead the token is the head of the containg sequence of tokens */ public void setIsHead(boolean isHead) { this.isHead = isHead; @@ -158,8 +167,8 @@ return false; } - if (other.stemmedForm.equals(stemmedForm) || other.alternativeForms.contains(stemmedForm) || - alternativeForms.contains(other.stemmedForm)) { + if (other.stemmedForm.equals(stemmedForm) || other.alternativeForms.containsKey(stemmedForm) || + alternativeForms.containsKey(other.stemmedForm)) { return true; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-10 14:35:02 UTC (rev 4206) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-10 15:25:13 UTC (rev 4207) @@ -13,6 +13,9 @@ * @author Daniel Fleischhacker */ public class TokenTree { + public static final double WORDNET_FACTOR = 0.3d; + public static final double ORIGINAL_FACTOR = 1.0d; + private LinkedHashMap<Token, TokenTree> children; private Set<Entity> entities; private List<Token> originalTokens; @@ -23,14 +26,15 @@ this.entities = new HashSet<>(); this.originalTokens = new ArrayList<>(); } - + /** * If set to TRUE, stopwords like 'of, on' are ignored during creation and retrieval operations. 
- * @param ignoreStopWords the ignoreStopWords to set - */ - public void setIgnoreStopWords(boolean ignoreStopWords) { - this.ignoreStopWords = ignoreStopWords; - } + * + * @param ignoreStopWords the ignoreStopWords to set + */ + public void setIgnoreStopWords(boolean ignoreStopWords) { + this.ignoreStopWords = ignoreStopWords; + } /** * Adds all given entities to the end of the path resulting from the given tokens. @@ -41,14 +45,14 @@ public void add(List<Token> tokens, Set<Entity> entities, List<Token> originalTokens) { TokenTree curNode = this; for (Token t : tokens) { - if(!ignoreStopWords || (ignoreStopWords && !t.isStopWord())){ - TokenTree nextNode = curNode.children.get(t); + if (!ignoreStopWords || (ignoreStopWords && !t.isStopWord())) { + TokenTree nextNode = curNode.children.get(t); if (nextNode == null) { nextNode = new TokenTree(); curNode.children.put(t, nextNode); } curNode = nextNode; - } + } } curNode.entities.addAll(entities); curNode.originalTokens = new ArrayList<>(originalTokens); @@ -90,6 +94,75 @@ return curNode.entities; } + public Set<EntityScorePair> getAllEntitiesScored(List<Token> tokens) { + HashSet<EntityScorePair> resEntities = new HashSet<>(); + getAllEntitiesScoredRec(tokens, 0, this, resEntities, 1.0); + + // only keep highest confidence for each entity + HashMap<Entity, Double> entityScores = new HashMap<>(); + + for (EntityScorePair p : resEntities) { + if (!entityScores.containsKey(p.getEntity())) { + entityScores.put(p.getEntity(), p.getScore()); + } + else { + entityScores.put(p.getEntity(), Math.max(p.getScore(), entityScores.get(p.getEntity()))); + } + } + + TreeSet<EntityScorePair> result = new TreeSet<>(); + for (Map.Entry<Entity, Double> e : entityScores.entrySet()) { + result.add(new EntityScorePair(e.getKey(), e.getValue())); + } + + return result; + } + + public void getAllEntitiesScoredRec(List<Token> tokens, int curPosition, TokenTree curTree, + HashSet<EntityScorePair> resEntities, Double curScore) { + + if (curPosition == tokens.size()) { + for (Entity e : curTree.entities) { + resEntities.add(new EntityScorePair(e, curScore)); + } + return; + } + Token currentTextToken = tokens.get(curPosition); + for (Map.Entry<Token, TokenTree> treeTokenEntry : curTree.children.entrySet()) { + if (currentTextToken.equals(treeTokenEntry.getKey())) { + getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * ORIGINAL_FACTOR); + } + else { + for (Map.Entry<String, Double> treeAlternativeForm : treeTokenEntry.getKey().getScoredAlternativeForms() + .entrySet()) { + if (currentTextToken.getStemmedForm().equals(treeAlternativeForm.getKey())) { + getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * ORIGINAL_FACTOR * treeAlternativeForm.getValue()); + } + } + for (Map.Entry<String, Double> textAlternativeForm : currentTextToken.getScoredAlternativeForms() + .entrySet()) { + if (treeTokenEntry.getKey().getStemmedForm().equals(textAlternativeForm.getKey())) { + getAllEntitiesScoredRec(tokens, curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * ORIGINAL_FACTOR * textAlternativeForm.getValue()); + } + } + + for (Map.Entry<String, Double> treeAlternativeForm : treeTokenEntry.getKey().getScoredAlternativeForms() + .entrySet()) { + for (Map.Entry<String, Double> textAlternativeForm : currentTextToken.getScoredAlternativeForms() + .entrySet()) { + if (treeAlternativeForm.getKey().equals(textAlternativeForm.getKey())) { + getAllEntitiesScoredRec(tokens, 
curPosition + 1, treeTokenEntry.getValue(), resEntities, + curScore * treeAlternativeForm.getValue() * textAlternativeForm.getValue()); + } + } + } + } + } + } + public Set<Entity> getAllEntities(List<Token> tokens) { HashSet<Entity> resEntities = new HashSet<>(); getAllEntitiesRec(tokens, 0, this, resEntities); @@ -145,7 +218,8 @@ /** * Returns the set of entities assigned to the longest matching token subsequence of the given token sequence. - * @param tokens token sequence to search for longest match + * + * @param tokens token sequence to search for longest match * @return set of entities assigned to the longest matching token subsequence of the given token sequence */ public Set<Entity> getEntitiesForLongestMatch(List<Token> tokens) { @@ -188,34 +262,37 @@ } public static void main(String[] args) throws Exception { - List<Token> tokens1 = Lists.newLinkedList(); - for (String s : Splitter.on(" ").split("this is a token tree")) { - tokens1.add(new Token(s, s, s, false, false)); - }; - - List<Token> tokens2 = Lists.newLinkedList(); - for (String s : Splitter.on(" ").split("this is a tokenized tree")) { - tokens2.add(new Token(s, s, s, false, false)); - }; - - TokenTree tree = new TokenTree(); - tree.add(tokens1, new NamedClass("TokenTree")); - tree.add(tokens2, new NamedClass("TokenizedTree")); + List<Token> tokens1 = Lists.newLinkedList(); + for (String s : Splitter.on(" ").split("this is a token tree")) { + tokens1.add(new Token(s, s, s, false, false)); + } + ; + + List<Token> tokens2 = Lists.newLinkedList(); + for (String s : Splitter.on(" ").split("this is a tokenized tree")) { + tokens2.add(new Token(s, s, s, false, false)); + } + ; + + TokenTree tree = new TokenTree(); + tree.add(tokens1, new NamedClass("TokenTree")); + tree.add(tokens2, new NamedClass("TokenizedTree")); System.out.println(tree); - + System.out.println(tree.getEntitiesForLongestMatch(tokens1)); System.out.println(tree.getLongestMatch(tokens1)); - + List<Token> tokens3 = Lists.newLinkedList(); - for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) { - tokens3.add(new Token(s, s, s, false, false)); - }; + for (String s : Splitter.on(" ").split("this is a very nice tokenized tree")) { + tokens3.add(new Token(s, s, s, false, false)); + } + ; System.out.println(tree.getLongestMatch(tokens3)); } - + public String toString() { - return "TokenTree\n"+ toString(0); + return "TokenTree\n" + toString(0); } public String toString(int indent) { @@ -233,5 +310,5 @@ return sb.toString(); } - + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
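Two conventions drive the scoring added in this revision: a candidate's score is the product of the factors along the path used to reach it (SYNONYM_FACTOR 0.8 for a synset's own lemmas, one HYPONYM_FACTOR 0.4 per hyponym level, ORIGINAL_FACTOR 1.0 for exact token matches), and when several paths reach the same lemma or entity, only the maximum is kept, both in LinguisticUtil#getScoredHyponyms and in TokenTree#getAllEntitiesScored. A toy illustration of that max-merge (the lemmas and paths are invented):

import java.util.HashMap;
import java.util.Map;

public class ScoreMergeSketch {

    // factors as declared in WordNet and TokenTree above
    static final double SYNONYM_FACTOR = 0.8;
    static final double HYPONYM_FACTOR = 0.4;

    /** Keep only the highest score seen so far for a lemma, as getScoredHyponyms does. */
    static void record(Map<String, Double> best, String lemma, double score) {
        Double old = best.get(lemma);
        best.put(lemma, old == null ? score : Math.max(old, score));
    }

    public static void main(String[] args) {
        Map<String, Double> best = new HashMap<String, Double>();
        // a lemma of the sense itself scores SYNONYM_FACTOR ...
        record(best, "car", SYNONYM_FACTOR);
        // ... a direct hyponym one HYPONYM_FACTOR less ...
        record(best, "cab", SYNONYM_FACTOR * HYPONYM_FACTOR);
        // ... and a second, deeper path to the same lemma loses against the first
        record(best, "cab", SYNONYM_FACTOR * HYPONYM_FACTOR * HYPONYM_FACTOR);

        System.out.println(best.get("car")); // 0.8
        System.out.println(best.get("cab")); // roughly 0.32, the max of 0.32 and 0.128
    }
}

In TokenTree#getAllEntitiesScored the same merge runs over entities instead of lemmas, with the stored alternative-form scores entering the product whenever a match goes through a WordNet form rather than the original token.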
From: <dfl...@us...> - 2013-12-10 15:41:39
Revision: 4208 http://sourceforge.net/p/dl-learner/code/4208 Author: dfleischhacker Date: 2013-12-10 15:41:36 +0000 (Tue, 10 Dec 2013) Log Message: ----------- Adapt WSD interfaces to scored candidates Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/RandomWordSenseDisambiguation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WordSenseDisambiguation.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/EntityCandidateGenerator.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -3,13 +3,14 @@ */ package org.dllearner.algorithms.isle; -import java.util.HashMap; -import java.util.Set; - import org.dllearner.algorithms.isle.index.Annotation; +import org.dllearner.algorithms.isle.index.EntityScorePair; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; +import java.util.HashMap; +import java.util.Set; + /** * @author Lorenz Buehmann * @@ -22,8 +23,8 @@ this.ontology = ontology; } - public abstract Set<Entity> getCandidates(Annotation annotation); + public abstract Set<EntityScorePair> getCandidates(Annotation annotation); - public abstract HashMap<Annotation,Set<Entity>> getCandidatesMap(Set<Annotation> annotations); + public abstract HashMap<Annotation,Set<EntityScorePair>> getCandidatesMap(Set<Annotation> annotations); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -17,10 +17,9 @@ /** * Gets set of candidate entities for a list of tokens - * @param s * @return */ - public Set<Entity> getCandidateEntities(List<Token> tokens); + public Set<EntityScorePair> getCandidateEntities(List<Token> tokens); /** @@ -28,14 +27,12 @@ * ontology string when the parameter string has been added to the trie after generation by using * WordNet or other additional methods. 
* - * @param s the string to search in the trie * @return string generating the path of the longest match in the trie */ public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens); /** * Gets the longest matching string - * @param s * @return */ public List<Token> getLongestMatchingText(List<Token> tokens); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -1,13 +1,12 @@ package org.dllearner.algorithms.isle.index; +import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; + import java.util.HashMap; import java.util.HashSet; import java.util.Set; -import org.dllearner.algorithms.isle.EntityCandidateGenerator; -import org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation; -import org.dllearner.core.owl.Entity; - /** * Provides methods to annotate documents. * @@ -23,7 +22,6 @@ /** * Initialize this semantic annotator to use the entities from the provided ontology. * - * @param ontology the ontology to use entities from */ public SemanticAnnotator(WordSenseDisambiguation wordSenseDisambiguation, EntityCandidateGenerator entityCandidateGenerator, LinguisticAnnotator linguisticAnnotator) { @@ -41,9 +39,9 @@ public AnnotatedDocument processDocument(TextDocument document){ Set<Annotation> annotations = linguisticAnnotator.annotate(document); Set<SemanticAnnotation> semanticAnnotations = new HashSet<SemanticAnnotation>(); - HashMap<Annotation,Set<Entity>> candidatesMap = entityCandidateGenerator.getCandidatesMap(annotations); + HashMap<Annotation, Set<EntityScorePair>> candidatesMap = entityCandidateGenerator.getCandidatesMap(annotations); for (Annotation annotation : candidatesMap.keySet()) { - Set<Entity> candidateEntities = candidatesMap.get(annotation); + Set<EntityScorePair> candidateEntities = candidatesMap.get(annotation); if (candidateEntities == null || candidateEntities.size() == 0) { continue; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidateGenerator.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -3,16 +3,16 @@ */ package org.dllearner.algorithms.isle.index; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; - import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.core.owl.Entity; import org.dllearner.utilities.owl.OWLAPIConverter; import org.semanticweb.owlapi.model.OWLEntity; import org.semanticweb.owlapi.model.OWLOntology; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + /** * @author Lorenz Buehmann * @@ -36,13 +36,17 @@ * @see org.dllearner.algorithms.isle.EntityCandidateGenerator#getCandidates(org.dllearner.algorithms.isle.index.Annotation) */ @Override - public Set<Entity> getCandidates(Annotation annotation) { - return allEntities; - } + public 
Set<EntityScorePair> getCandidates(Annotation annotation) { + HashSet<EntityScorePair> result = new HashSet<>(); + for (Entity e : allEntities) { + result.add(new EntityScorePair(e, 1.0)); + } + return result; + } @Override - public HashMap<Annotation, Set<Entity>> getCandidatesMap(Set<Annotation> annotations) { - HashMap<Annotation, Set<Entity>> result = new HashMap<Annotation, Set<Entity>>(); + public HashMap<Annotation, Set<EntityScorePair>> getCandidatesMap(Set<Annotation> annotations) { + HashMap<Annotation, Set<EntityScorePair>> result = new HashMap<Annotation, Set<EntityScorePair>>(); for (Annotation annotation: annotations) result.put(annotation, getCandidates(annotation)); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 15:25:13 UTC (rev 4207) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-10 15:41:36 UTC (rev 4208) @@ -1,160 +1,156 @@ -package org.dllearner.algorithms.isle.index; - -import net.didion.jwnl.data.POS; -import org.dllearner.algorithms.isle.WordNet; -import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; -import org.dllearner.core.owl.Entity; -import org.semanticweb.owlapi.model.OWLOntology; - -import java.util.*; -import java.util.Map.Entry; - -public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { - TokenTree tree; - EntityTextRetriever entityTextRetriever; - -// /** -// * Initialize the trie with strings from the provided ontology using a no-op name generator, i.e., only the -// * actual ontology strings are added and no expansion is done. -// * -// * @param entityTextRetriever the text retriever to use -// * @param ontology the ontology to get strings from -// */ -// public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { -// this(entityTextRetriever, ontology, new DummyNameGenerator()); -// } - - /** - * Initialize the trie with strings from the provided ontology and use the given entity name generator - * for generating alternative words. 
- * - * @param entityTextRetriever the text retriever to use - * @param ontology the ontology to get strings from - */ - public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { - this.entityTextRetriever = entityTextRetriever; - buildTrie(ontology); - } - - public void buildTrie(OWLOntology ontology) { - this.tree = new TokenTree(); - Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); - - - for (Entry<Entity, Set<List<Token>>> entry : entity2TokenSet.entrySet()) { - Entity entity = entry.getKey(); - Set<List<Token>> tokenSet = entry.getValue(); - for (List<Token> tokens : tokenSet) { - addAlternativeFormsFromWordNet(tokens); - addEntry(tokens, entity); - addSubsequences(entity, tokens); - } - } - } - - /** - * Adds the subsequences of a test - * @param entity - * @param tokens - */ - private void addSubsequences(Entity entity, List<Token> tokens) { - tree.add(tokens, entity); - for (int size = 1; size < tokens.size(); size++) { - for (int start = 0; start < tokens.size() - size + 1; start++) { - ArrayList<Token> subsequence = new ArrayList<>(); - for (int i = 0; i < size; i++) { - subsequence.add(tokens.get(start + i)); - } - addEntry(subsequence, entity); - } - } - } - - private void addAlternativeFormsFromWordNet(List<Token> tokens) { - for (Token t : tokens) { - POS wordnetPos = null; - String posTag = t.getPOSTag(); - if (posTag.startsWith("N")) {//nouns - wordnetPos = POS.NOUN; - } - else if (posTag.startsWith("V")) {//verbs - wordnetPos = POS.VERB; - } - else if (posTag.startsWith("J")) {//adjectives - wordnetPos = POS.ADJECTIVE; - } - else if (posTag.startsWith("R")) {//adverbs - wordnetPos = POS.ADVERB; - } - if (wordnetPos == null) { - continue; - } - //String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); - Set<WordNet.LemmaScorePair> alternativeFormPairs = LinguisticUtil.getInstance() - .getScoredHyponyms(t.getRawForm(), wordnetPos); - - for (WordNet.LemmaScorePair synonym : alternativeFormPairs) { - // ignore all multi word synonyms - if (synonym.getLemma().contains("_")) { - continue; - } - //t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); - t.addAlternativeForm(synonym.getLemma(), synonym.getScore()); - } - } - } - - @Override - public void addEntry(List<Token> s, Entity e) { - tree.add(s, e); - } - - public void addEntry(List<Token> s, Entity e, List<Token> originalTokens) { - tree.add(s, e, originalTokens); - } - - @Override - public Set<Entity> getCandidateEntities(List<Token> tokens) { - Set<Entity> res = tree.getAllEntities(tokens); - System.out.println("Unscored: " + res); - Set<EntityScorePair> scored = tree.getAllEntitiesScored(tokens); - System.out.println("Scored: " + scored); - - return res; - } - - @Override - public List<Token> getGeneratingStringForLongestMatch(List<Token> tokens) { - return tree.getOriginalTokensForLongestMatch(tokens); - } - - @Override - public List<Token> getLongestMatchingText(List<Token> tokens) { - return tree.getLongestMatch(tokens); - } - - public String toString() { - return tree.toString(); - } - - public static void main(String[] args) { - String[] tokens = "this is a long and very complex text".split(" "); - - List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; - - // generate list of lemmatized wordnet synonyms for each token - for (int i = 0; i < tokens.length; i++) { - wordnetTokens[i] = new ArrayList<String>(); - 
wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i])); - for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { - System.out.println("Adding: " + LinguisticUtil.getInstance().getNormalizedForm(w)); - wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).replaceAll("_", " ")); - } - } - } - - public void printTrie() { - System.out.println(this.toString()); - - } -} +package org.dllearner.algorithms.isle.index; + +import net.didion.jwnl.data.POS; +import org.dllearner.algorithms.isle.WordNet; +import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; + +import java.util.*; +import java.util.Map.Entry; + +public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { + TokenTree tree; + EntityTextRetriever entityTextRetriever; + +// /** +// * Initialize the trie with strings from the provided ontology using a no-op name generator, i.e., only the +// * actual ontology strings are added and no expansion is done. +// * +// * @param entityTextRetriever the text retriever to use +// * @param ontology the ontology to get strings from +// */ +// public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { +// this(entityTextRetriever, ontology, new DummyNameGenerator()); +// } + + /** + * Initialize the trie with strings from the provided ontology and use the given entity name generator + * for generating alternative words. + * + * @param entityTextRetriever the text retriever to use + * @param ontology the ontology to get strings from + */ + public SimpleEntityCandidatesTrie(EntityTextRetriever entityTextRetriever, OWLOntology ontology) { + this.entityTextRetriever = entityTextRetriever; + buildTrie(ontology); + } + + public void buildTrie(OWLOntology ontology) { + this.tree = new TokenTree(); + Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); + + + for (Entry<Entity, Set<List<Token>>> entry : entity2TokenSet.entrySet()) { + Entity entity = entry.getKey(); + Set<List<Token>> tokenSet = entry.getValue(); + for (List<Token> tokens : tokenSet) { + addAlternativeFormsFromWordNet(tokens); + addEntry(tokens, entity); + addSubsequences(entity, tokens); + } + } + } + + /** + * Adds the subsequences of a test + * @param entity + * @param tokens + */ + private void addSubsequences(Entity entity, List<Token> tokens) { + tree.add(tokens, entity); + for (int size = 1; size < tokens.size(); size++) { + for (int start = 0; start < tokens.size() - size + 1; start++) { + ArrayList<Token> subsequence = new ArrayList<>(); + for (int i = 0; i < size; i++) { + subsequence.add(tokens.get(start + i)); + } + addEntry(subsequence, entity); + } + } + } + + private void addAlternativeFormsFromWordNet(List<Token> tokens) { + for (Token t : tokens) { + POS wordnetPos = null; + String posTag = t.getPOSTag(); + if (posTag.startsWith("N")) {//nouns + wordnetPos = POS.NOUN; + } + else if (posTag.startsWith("V")) {//verbs + wordnetPos = POS.VERB; + } + else if (posTag.startsWith("J")) {//adjectives + wordnetPos = POS.ADJECTIVE; + } + else if (posTag.startsWith("R")) {//adverbs + wordnetPos = POS.ADVERB; + } + if (wordnetPos == null) { + continue; + } + //String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); + Set<WordNet.LemmaScorePair> alternativeFormPairs = LinguisticUtil.getInstance() + .getScoredHyponyms(t.getRawForm(), 
Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java	2013-12-10 15:25:13 UTC (rev 4207)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java	2013-12-10 15:41:36 UTC (rev 4208)
@@ -3,7 +3,6 @@
 import com.google.common.collect.Lists;
 import org.dllearner.algorithms.isle.EntityCandidateGenerator;
 import org.dllearner.algorithms.isle.StopWordFilter;
-import org.dllearner.core.owl.Entity;
 import org.semanticweb.owlapi.model.OWLOntology;
 
 import java.util.ArrayList;
@@ -27,8 +26,8 @@
         this.candidatesTrie = candidatesTrie;
     }
 
-    public Set<Entity> getCandidates(Annotation annotation) {
-        Set<Entity> candidateEntities = candidatesTrie.getCandidateEntities(annotation.getTokens());
+    public Set<EntityScorePair> getCandidates(Annotation annotation) {
+        Set<EntityScorePair> candidateEntities = candidatesTrie.getCandidateEntities(annotation.getTokens());
         System.out.println(annotation + " --> " + candidateEntities);
         return candidateEntities;
     }
@@ -39,7 +38,7 @@
      * @param window : maximum distance between the annotations
      * @return
      */
-    public void postProcess(HashMap<Annotation,Set<Entity>> candidatesMap, int window, StopWordFilter stopWordFilter) {
+    public void postProcess(HashMap<Annotation,Set<EntityScorePair>> candidatesMap, int window, StopWordFilter stopWordFilter) {
         Set<Annotation> annotations = candidatesMap.keySet();
         List<Annotation> sortedAnnotations = new ArrayList<Annotation>(annotations);
         //TODO refactoring
@@ -119,8 +118,8 @@
     }
 
     @Override
-    public HashMap<Annotation, Set<Entity>> getCandidatesMap(Set<Annotation> annotations) {
-        HashMap<Annotation, Set<Entity>> candidatesMap = new HashMap<Annotation, Set<Entity>>();
+    public HashMap<Annotation, Set<EntityScorePair>> getCandidatesMap(Set<Annotation> annotations) {
+        HashMap<Annotation, Set<EntityScorePair>> candidatesMap = new HashMap<>();
         for (Annotation annotation: annotations)
             candidatesMap.put(annotation, getCandidates(annotation));
 
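A wiring sketch for the generator, not part of the commit: the two-argument constructor is inferred from the field assignment in the hunk above and is an assumption, as is the window size of 3; getCandidatesMap and postProcess use the signatures shown in this diff.

    package org.dllearner.algorithms.isle.index;

    import org.dllearner.algorithms.isle.StopWordFilter;
    import org.semanticweb.owlapi.model.OWLOntology;

    import java.util.HashMap;
    import java.util.Set;

    public class CandidateGenerationDemo {
        public static HashMap<Annotation, Set<EntityScorePair>> run(OWLOntology ontology,
                SimpleEntityCandidatesTrie trie, Set<Annotation> annotations, StopWordFilter stopWordFilter) {
            // assumption: the constructor takes the ontology and the candidates trie,
            // matching the field assignment shown in the hunk above
            TrieEntityCandidateGenerator generator = new TrieEntityCandidateGenerator(ontology, trie);
            HashMap<Annotation, Set<EntityScorePair>> candidatesMap = generator.getCandidatesMap(annotations);
            // post-process candidates of annotations at most 3 tokens apart;
            // the window size is an arbitrary illustration
            generator.postProcess(candidatesMap, 3, stopWordFilter);
            return candidatesMap;
        }
    }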
Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/RandomWordSenseDisambiguation.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/RandomWordSenseDisambiguation.java	2013-12-10 15:25:13 UTC (rev 4207)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/RandomWordSenseDisambiguation.java	2013-12-10 15:41:36 UTC (rev 4208)
@@ -18,14 +18,15 @@
  */
 package org.dllearner.algorithms.isle.wsd;
 
-import java.util.Random;
-import java.util.Set;
-
 import org.dllearner.algorithms.isle.index.Annotation;
+import org.dllearner.algorithms.isle.index.EntityScorePair;
 import org.dllearner.algorithms.isle.index.SemanticAnnotation;
 import org.dllearner.core.owl.Entity;
 import org.semanticweb.owlapi.model.OWLOntology;
 
+import java.util.Random;
+import java.util.Set;
+
 /**
  * Disambiguation by randomly selecting one of the candidates (baseline method).
  *
@@ -43,17 +44,17 @@
     @Override
     public SemanticAnnotation disambiguate(Annotation annotation,
-            Set<Entity> candidateEntities) {
+            Set<EntityScorePair> candidateEntities) {
         int pos = random.nextInt(candidateEntities.size());
         int i = 0;
-        for(Entity e : candidateEntities)
-        {
-            if (i == pos) {
-                return new SemanticAnnotation(annotation, e);
-            }
-            i++;
-        }
-        return null;
+        for(EntityScorePair esp : candidateEntities) {
+            Entity e = esp.getEntity();
+            if (i == pos) {
+                return new SemanticAnnotation(annotation, e);
+            }
+            i++;
+        }
+        return null;
     }
 }
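A caller-side note, not part of the commit: random.nextInt(candidateEntities.size()) throws IllegalArgumentException when the candidate set is empty, so this baseline cannot return null for empty input the way the other implementations do. A small guard, sketched with a hypothetical helper name:

    package org.dllearner.algorithms.isle.wsd;

    import org.dllearner.algorithms.isle.index.Annotation;
    import org.dllearner.algorithms.isle.index.EntityScorePair;
    import org.dllearner.algorithms.isle.index.SemanticAnnotation;

    import java.util.Set;

    final class WsdGuard {
        // hypothetical helper: fall back to null for empty candidate sets,
        // mirroring what the other disambiguation implementations return
        static SemanticAnnotation disambiguateOrNull(WordSenseDisambiguation wsd,
                Annotation annotation, Set<EntityScorePair> candidates) {
            return candidates.isEmpty() ? null : wsd.disambiguate(annotation, candidates);
        }
    }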
Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java	2013-12-10 15:25:13 UTC (rev 4207)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/SimpleWordSenseDisambiguation.java	2013-12-10 15:41:36 UTC (rev 4208)
@@ -3,26 +3,20 @@
  */
 package org.dllearner.algorithms.isle.wsd;
 
-import java.util.HashSet;
-import java.util.Set;
-
 import org.apache.log4j.Logger;
 import org.dllearner.algorithms.isle.index.Annotation;
+import org.dllearner.algorithms.isle.index.EntityScorePair;
 import org.dllearner.algorithms.isle.index.SemanticAnnotation;
 import org.dllearner.core.owl.Entity;
 import org.dllearner.utilities.owl.OWLAPIConverter;
-import org.semanticweb.owlapi.model.IRI;
-import org.semanticweb.owlapi.model.OWLAnnotationAssertionAxiom;
-import org.semanticweb.owlapi.model.OWLAnnotationProperty;
-import org.semanticweb.owlapi.model.OWLDataFactory;
-import org.semanticweb.owlapi.model.OWLEntity;
-import org.semanticweb.owlapi.model.OWLLiteral;
-import org.semanticweb.owlapi.model.OWLOntology;
+import org.semanticweb.owlapi.model.*;
 import org.semanticweb.owlapi.util.IRIShortFormProvider;
 import org.semanticweb.owlapi.util.SimpleIRIShortFormProvider;
-
 import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl;
+import java.util.HashSet;
+import java.util.Set;
+
 
 /**
  * @author Lorenz Buehmann
 *
@@ -47,26 +41,27 @@
     * @see org.dllearner.algorithms.isle.WordSenseDisambiguation#disambiguate(org.dllearner.algorithms.isle.index.Annotation, java.util.Set)
     */
    @Override
-    public SemanticAnnotation disambiguate(Annotation annotation, Set<Entity> candidateEntities) {
+    public SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities) {
        logger.debug("Linguistic annotations:\n" + annotation);
        logger.debug("Candidate entities:" + candidateEntities);
        String token = annotation.getString().trim();
        //check if annotation token matches label of entity or the part behind #(resp. /)
-        for (Entity entity : candidateEntities) {
-            Set<String> labels = getLabels(entity);
-            for (String label : labels) {
-                if(label.equals(token)){
-                    logger.debug("Disambiguated entity: " + entity);
-                    return new SemanticAnnotation(annotation, entity);
-                }
-            }
-            String shortForm = sfp.getShortForm(IRI.create(entity.getURI()));
-            if(annotation.equals(shortForm)){
-                logger.debug("Disambiguated entity: " + entity);
-                return new SemanticAnnotation(annotation, entity);
-            }
-        }
-        return null;
+        for (EntityScorePair entityScorePair : candidateEntities) {
+            Entity entity = entityScorePair.getEntity();
+            Set<String> labels = getLabels(entity);
+            for (String label : labels) {
+                if (label.equals(token)) {
+                    logger.debug("Disambiguated entity: " + entity);
+                    return new SemanticAnnotation(annotation, entity);
+                }
+            }
+            String shortForm = sfp.getShortForm(IRI.create(entity.getURI()));
+            // compare the token string (not the Annotation object) with the IRI short form
+            if (token.equals(shortForm)) {
+                logger.debug("Disambiguated entity: " + entity);
+                return new SemanticAnnotation(annotation, entity);
+            }
+        }
+        return null;
    }
 
    private Set<String> getLabels(Entity entity){

Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java	2013-12-10 15:25:13 UTC (rev 4207)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/StructureBasedWordSenseDisambiguation.java	2013-12-10 15:41:36 UTC (rev 4208)
@@ -3,21 +3,21 @@
  */
 package org.dllearner.algorithms.isle.wsd;
 
-import java.io.IOException;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
-
+import com.google.common.base.Joiner;
+import com.google.common.collect.Sets;
 import org.dllearner.algorithms.isle.StructuralEntityContext;
 import org.dllearner.algorithms.isle.VSMCosineDocumentSimilarity;
 import org.dllearner.algorithms.isle.index.Annotation;
+import org.dllearner.algorithms.isle.index.EntityScorePair;
 import org.dllearner.algorithms.isle.index.SemanticAnnotation;
 import org.dllearner.core.owl.Entity;
 import org.semanticweb.owlapi.model.OWLOntology;
 
-import com.google.common.base.Joiner;
-import com.google.common.collect.Sets;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
 /**
  * @author Lorenz Buehmann
@@ -39,7 +39,7 @@
     * @see org.dllearner.algorithms.isle.wsd.WordSenseDisambiguation#disambiguate(org.dllearner.algorithms.isle.index.Annotation, java.util.Set)
     */
    @Override
-    public SemanticAnnotation disambiguate(Annotation annotation, Set<Entity> candidateEntities) {
+    public SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities) {
        if(!candidateEntities.isEmpty()){
            //get the context of the annotated token
            List<String> tokenContext = contextExtractor.extractContext(annotation);
@@ -47,19 +47,20 @@
            //compare this context with the context of each entity candidate
            double maxScore = Double.NEGATIVE_INFINITY;
            Entity bestEntity = null;
-            for (Entity entity : candidateEntities) {
-                //get the context of the entity by analyzing the structure of the ontology
-                Set<String> entityContext = StructuralEntityContext.getContextInNaturalLanguage(ontology, entity);
-                //compute the VSM Cosine Similarity
-                double score = computeScore(tokenContext, entityContext);
-                //set best entity
-                if(score > maxScore){
-                    maxScore = score;
-                    bestEntity = entity;
-                }
-            }
-
-            return new SemanticAnnotation(annotation, bestEntity);
+            for (EntityScorePair entityScorePair : candidateEntities) {
+                Entity entity = entityScorePair.getEntity();
+                //get the context of the entity by analyzing the structure of the ontology
+                Set<String> entityContext = StructuralEntityContext.getContextInNaturalLanguage(ontology, entity);
+                //compute the VSM Cosine Similarity
+                double score = computeScore(tokenContext, entityContext);
+                //set best entity
+                if (score > maxScore) {
+                    maxScore = score;
+                    bestEntity = entity;
+                }
+            }
+
+            return new SemanticAnnotation(annotation, bestEntity);
        }
        return null;
    }
Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WordSenseDisambiguation.java
===================================================================
--- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WordSenseDisambiguation.java	2013-12-10 15:25:13 UTC (rev 4207)
+++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/wsd/WordSenseDisambiguation.java	2013-12-10 15:41:36 UTC (rev 4208)
@@ -1,12 +1,12 @@
 package org.dllearner.algorithms.isle.wsd;
 
-import java.util.Set;
-
 import org.dllearner.algorithms.isle.index.Annotation;
+import org.dllearner.algorithms.isle.index.EntityScorePair;
 import org.dllearner.algorithms.isle.index.SemanticAnnotation;
-import org.dllearner.core.owl.Entity;
 import org.semanticweb.owlapi.model.OWLOntology;
 
+import java.util.Set;
+
 /**
  * Abstract class for the word sense disambiguation component.
  *
@@ -27,9 +27,10 @@
    /**
     * Chooses the correct entity for the given annotation from a set of candidate entities.
     *
+    *
     * @param annotation the annotation to find entity for
     * @param candidateEntities the set of candidate entities
     * @return semantic annotation containing the given annotation and the chosen entity
     */
-    public abstract SemanticAnnotation disambiguate(Annotation annotation, Set<Entity> candidateEntities);
+    public abstract SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities);
 }
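Since every implementation now receives scored candidates, a score-aware strategy fits the same contract. A sketch, not part of the commit: it assumes EntityScorePair exposes getScore() alongside the getEntity() accessor used above, that higher scores mean better matches, and that the abstract base class keeps a constructor taking the OWLOntology, as the subclasses here suggest.

    package org.dllearner.algorithms.isle.wsd;

    import org.dllearner.algorithms.isle.index.Annotation;
    import org.dllearner.algorithms.isle.index.EntityScorePair;
    import org.dllearner.algorithms.isle.index.SemanticAnnotation;
    import org.dllearner.core.owl.Entity;
    import org.semanticweb.owlapi.model.OWLOntology;

    import java.util.Set;

    public class TopScoreWordSenseDisambiguation extends WordSenseDisambiguation {

        public TopScoreWordSenseDisambiguation(OWLOntology ontology) {
            super(ontology); // assumption: the abstract base stores the ontology
        }

        @Override
        public SemanticAnnotation disambiguate(Annotation annotation, Set<EntityScorePair> candidateEntities) {
            Entity bestEntity = null;
            double maxScore = Double.NEGATIVE_INFINITY;
            for (EntityScorePair esp : candidateEntities) {
                double score = esp.getScore(); // assumption: accessor next to getEntity()
                if (score > maxScore) {
                    maxScore = score;
                    bestEntity = esp.getEntity();
                }
            }
            // empty candidate set -> null, like the other implementations
            return bestEntity == null ? null : new SemanticAnnotation(annotation, bestEntity);
        }
    }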