From: <dfl...@us...> - 2013-08-19 09:53:00
|
Revision: 4024 http://sourceforge.net/p/dl-learner/code/4024 Author: dfleischhacker Date: 2013-08-19 09:52:57 +0000 (Mon, 19 Aug 2013) Log Message: ----------- TR API: Document instead of String for documents Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Document.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -0,0 +1,24 @@ +package org.dllearner.algorithms.isle.index; + +/** + * Interface for classes representing documents. + * + * @author Daniel Fleischhacker + */ +public interface Document { + /** + * Returns the cleaned content of this document represented as a string. This returns the cleaned content, + * thus markup and other structure is removed. The raw content can be retrieved using {@link #getRawContent}. + * Methods for retrieving more specialized content formats might be implemented by the actual implementations. + * + * @return this document's text content + */ + public String getContent(); + + /** + * Returns the uncleaned content, i.e., as originally retrieved, of this document represented as string. + * + * @return uncleaned content of this document + */ + public String getRawContent(); +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java 2013-08-15 09:42:17 UTC (rev 4023) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LuceneSyntacticIndex.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -3,11 +3,6 @@ */ package org.dllearner.algorithms.isle.index; -import java.io.File; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; @@ -22,6 +17,11 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; +import java.io.File; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + /** * @author Lorenz Buehmann * @@ -41,26 +41,26 @@ parser = new QueryParser( Version.LUCENE_43, searchField, analyzer ); } - public LuceneSyntacticIndex(Directory directory, String seachField) throws Exception { - this(DirectoryReader.open(directory), seachField); + public LuceneSyntacticIndex(Directory directory, String searchField) throws Exception { + this(DirectoryReader.open(directory), searchField); } - public LuceneSyntacticIndex(String indexDirectory, String seachField) throws Exception { - this(DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), seachField); + public LuceneSyntacticIndex(String indexDirectory, String searchField) throws Exception { + this(DirectoryReader.open(FSDirectory.open(new File(indexDirectory))), searchField); } /* (non-Javadoc) * @see org.dllearner.algorithms.isle.SyntacticIndex#getDocuments(java.lang.String) */ @Override - public Set<String> getDocuments(String searchString) { - Set<String> documents = new HashSet<String>(); + public Set<org.dllearner.algorithms.isle.index.Document> getDocuments(String searchString) { + Set<org.dllearner.algorithms.isle.index.Document> documents = new HashSet<org.dllearner.algorithms.isle.index.Document>(); try { Query query = parser.parse(searchString); ScoreDoc[] result = searcher.search(query, getSize()).scoreDocs; for (int i = 0; i < result.length; i++) { Document doc = searcher.doc(result[i].doc); - documents.add(doc.get(searchField)); + documents.add(new TextDocument(doc.get(searchField))); } } catch (ParseException e) { e.printStackTrace(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java 2013-08-15 09:42:17 UTC (rev 4023) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/OWLOntologyLuceneSyntacticIndexCreator.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -3,13 +3,8 @@ */ package org.dllearner.algorithms.isle.index; -import java.io.IOException; -import java.util.HashSet; -import java.util.Set; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.StringField; @@ -19,16 +14,14 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.util.Version; -import org.semanticweb.owlapi.model.OWLAnnotation; -import org.semanticweb.owlapi.model.OWLAnnotationProperty; -import org.semanticweb.owlapi.model.OWLDataFactory; -import org.semanticweb.owlapi.model.OWLEntity; -import org.semanticweb.owlapi.model.OWLLiteral; -import org.semanticweb.owlapi.model.OWLOntology; +import org.semanticweb.owlapi.model.*; import org.semanticweb.owlapi.vocab.OWLRDFVocabulary; - import uk.ac.manchester.cs.owl.owlapi.OWLDataFactoryImpl; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; + /** * Creates a Lucene Index for the labels if classes and properties. * @author Lorenz Buehmann @@ -61,8 +54,8 @@ IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); IndexWriter writer = new IndexWriter(directory, indexWriterConfig); System.out.println( "Creating index ..." ); - - Set<Document> luceneDocuments = new HashSet<Document>(); + + Set<org.apache.lucene.document.Document> luceneDocuments = new HashSet<org.apache.lucene.document.Document>(); FieldType stringType = new FieldType(StringField.TYPE_STORED); stringType.setStoreTermVectors(false); FieldType textType = new FieldType(TextField.TYPE_STORED); @@ -81,7 +74,7 @@ } if(label != null){ - Document luceneDocument = new Document(); + org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document(); luceneDocument.add(new Field("uri", entity.toStringID(), stringType)); luceneDocument.add(new Field(searchField, label, textType)); luceneDocuments.add(luceneDocument); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java 2013-08-15 09:42:17 UTC (rev 4023) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticIndex.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -1,35 +1,37 @@ -/** - * - */ package org.dllearner.algorithms.isle.index; +import org.dllearner.core.owl.Entity; + import java.util.Set; -import org.dllearner.core.owl.Entity; - /** - * This class + * Interface for an index which is able to resolve a given entity's URI to the set of documents containing + * this entity, i.e., documents which contain words disambiguated to the given entity. + * * @author Lorenz Buehmann - * + * @author Daniel Fleischhacker */ public interface SemanticIndex { + /** + * Returns the set of documents which reference the given entity using one of its surface forms. + * + * @param entity entity to retrieve documents + * @return documents referencing given entity + */ + public Set<Document> getDocuments(Entity entity); - /** - * This method returns a set of documents for the given entity. - * @param entity - * @return - */ - Set<String> getDocuments(Entity entity); - /** - * This method returns the number of documents for the given entity. - * @param entity - * @return - */ - int count(Entity entity); - /** - * This methods returns the total number of documents contained in the index. - * @return the total number of documents contained in the index - */ - int getSize(); + /** + * Returns the number of documents for the given entity. + * + * @param entity entity to return number of referencing documents for + * @return number of documents for the given entity in this index + */ + public int count(Entity entity); + /** + * Returns the total number of documents contained in the index. + * + * @return the total number of documents contained in the index + */ + public int getSize(); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java 2013-08-15 09:42:17 UTC (rev 4023) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleSemanticIndex.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -3,20 +3,20 @@ */ package org.dllearner.algorithms.isle.index; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; + import java.util.HashSet; import java.util.Map; import java.util.Map.Entry; import java.util.Set; -import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; -import org.dllearner.core.owl.Entity; -import org.semanticweb.owlapi.model.OWLOntology; - /** * @author Lorenz Buehmann * */ -public class SimpleSemanticIndex implements SemanticIndex{ +public class SimpleSemanticIndex implements SemanticIndex { private SyntacticIndex syntacticIndex; private RDFSLabelEntityTextRetriever labelRetriever; @@ -34,8 +34,8 @@ * @see org.dllearner.algorithms.isle.SemanticIndex#getDocuments(org.dllearner.core.owl.Entity) */ @Override - public Set<String> getDocuments(Entity entity) { - Set<String> documents = new HashSet<String>(); + public Set<Document> getDocuments(Entity entity) { + Set<Document> documents = new HashSet<Document>(); Map<String, Double> relevantText = labelRetriever.getRelevantText(entity); for (Entry<String, Double> entry : relevantText.entrySet()) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java 2013-08-15 09:42:17 UTC (rev 4023) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SyntacticIndex.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -1,32 +1,41 @@ /** - * + * */ package org.dllearner.algorithms.isle.index; import java.util.Set; /** + * Interface for a syntactic index, e.g., a basic string-based inverted index. + * * @author Lorenz Buehmann - * + * @author Daniel Fleischhacker */ public interface SyntacticIndex { - /** - * This method returns a set of documents based on how the underlying index is processing the given search string. - * @param searchString - * @return - */ - Set<String> getDocuments(String searchString); - /** - * This method returns the number of documents based on how the underlying index is processing the given search string. - * @param searchString - * @return - */ - int count(String searchString); - /** - * This methods returns the total number of documents contained in the index. - * @return the total number of documents contained in the index - */ - int getSize(); - + /** + * Returns a set of documents based on how the underlying index is processing the given + * search string. + * + * @param searchString query specifying the documents to retrieve + * @return set of documents retrieved based on the given query string + */ + Set<Document> getDocuments(String searchString); + + /** + * Returns the number of documents based on how the underlying index is processing the + * given search string. + * + * @param searchString query specifying the documents to include in the number of documents + * @return number of documents retrieved based on the given query string + */ + int count(String searchString); + + /** + * Returns the total number of documents contained in the index. + * + * @return the total number of documents contained in the index + */ + int getSize(); + } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-08-19 09:52:57 UTC (rev 4024) @@ -0,0 +1,29 @@ +package org.dllearner.algorithms.isle.index; + +/** + * A simple text document without further formatting or markup. + * + * @author Daniel Fleischhacker + */ +public class TextDocument implements Document { + private String content; + + public TextDocument(String content) { + this.content = content; + } + + @Override + public String getContent() { + return content; + } + + /** + * The text content of this document. Returns the same data as {@link #getContent()}. + * + * @return text content of this document + */ + @Override + public String getRawContent() { + return content; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-09-03 13:53:45
|
Revision: 4028 http://sourceforge.net/p/dl-learner/code/4028 Author: lorenz_b Date: 2013-09-03 13:53:43 +0000 (Tue, 03 Sep 2013) Log Message: ----------- Added annotation class and interface for annotated document. Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-09-03 13:53:43 UTC (rev 4028) @@ -0,0 +1,36 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.util.Set; + +import org.dllearner.core.owl.Entity; + +/** + * @author Lorenz Buehmann + * + */ +public interface AnnotatedDocument { + + /** + * Returns a set of entities which are contained in the document. + * @return + */ + Set<Entity> getContainedEntities(); + + /** + * Returns all annotations of the document. + * @return + */ + Set<Annotation> getAnnotations(); + + /** + * Returns the annotation at the given position(offset) of given length. + * @param offset + * @param length + * @return + */ + Annotation getAnnotation(int offset, int length); + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-09-03 13:53:43 UTC (rev 4028) @@ -0,0 +1,81 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import org.dllearner.core.owl.Entity; + +/** + * @author Lorenz Buehmann + * + */ +public class Annotation { + + private Document getReferencedDocument; + private Entity entity; + private int offset; + private int length; + + public Annotation(Document getReferencedDocument, Entity entity, int offset, int length) { + this.getReferencedDocument = getReferencedDocument; + this.entity = entity; + this.offset = offset; + this.length = length; + } + + public Document getGetReferencedDocument() { + return getReferencedDocument; + } + + public Entity getEntity() { + return entity; + } + + public int getOffset() { + return offset; + } + + public int getLength() { + return length; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + ((entity == null) ? 0 : entity.hashCode()); + result = prime * result + ((getReferencedDocument == null) ? 0 : getReferencedDocument.hashCode()); + result = prime * result + length; + result = prime * result + offset; + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + Annotation other = (Annotation) obj; + if (entity == null) { + if (other.entity != null) + return false; + } else if (!entity.equals(other.entity)) + return false; + if (getReferencedDocument == null) { + if (other.getReferencedDocument != null) + return false; + } else if (!getReferencedDocument.equals(other.getReferencedDocument)) + return false; + if (length != other.length) + return false; + if (offset != other.offset) + return false; + return true; + } + + + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-09-03 14:19:53
|
Revision: 4029 http://sourceforge.net/p/dl-learner/code/4029 Author: dfleischhacker Date: 2013-09-03 14:19:50 +0000 (Tue, 03 Sep 2013) Log Message: ----------- Refactor AnnotatedDocument and SemanticIndex Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-09-03 13:53:43 UTC (rev 4028) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-09-03 14:19:50 UTC (rev 4029) @@ -11,7 +11,7 @@ * @author Lorenz Buehmann * */ -public interface AnnotatedDocument { +public interface AnnotatedDocument extends Document { /** * Returns a set of entities which are contained in the document. @@ -33,4 +33,11 @@ */ Annotation getAnnotation(int offset, int length); + /** + * Returns the number of occurrences of the given entity in this document. + * + * @param entity the entity to get frequency for + * @return number of occurrences of given entity in this document + */ + int getEntityFrequency(Entity entity); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-03 13:53:43 UTC (rev 4028) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-03 14:19:50 UTC (rev 4029) @@ -1,5 +1,6 @@ package org.dllearner.algorithms.isle.index.semantic; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; import org.dllearner.algorithms.isle.index.Document; import org.dllearner.core.owl.Entity; @@ -14,12 +15,12 @@ */ public interface SemanticIndex { /** - * Returns the set of documents which reference the given entity using one of its surface forms. + * Returns the set of annotated documents which reference the given entity using one of its surface forms. * * @param entity entity to retrieve documents * @return documents referencing given entity */ - public Set<Document> getDocuments(Entity entity); + public Set<AnnotatedDocument> getDocuments(Entity entity); /** * Returns the number of documents for the given entity. This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-09-03 14:28:03
|
Revision: 4030 http://sourceforge.net/p/dl-learner/code/4030 Author: lorenz_b Date: 2013-09-03 14:28:00 +0000 (Tue, 03 Sep 2013) Log Message: ----------- Implemented AnnotatedDocument interface. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-09-03 14:28:00 UTC (rev 4030) @@ -0,0 +1,91 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +import java.util.HashSet; +import java.util.Set; + +import org.dllearner.core.owl.Entity; + +/** + * @author Lorenz Buehmann + * + */ +public class AnnotatedTextDocument implements AnnotatedDocument{ + + private TextDocument document; + private Set<Annotation> annotations; + private Set<Entity> entities; + + + public AnnotatedTextDocument(TextDocument document, Set<Annotation> annotations) { + this.document = document; + this.annotations = annotations; + + entities = new HashSet<Entity>(); + for (Annotation annotation : annotations) { + entities.add(annotation.getEntity()); + } + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Document#getContent() + */ + @Override + public String getContent() { + return document.getContent(); + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Document#getRawContent() + */ + @Override + public String getRawContent() { + return document.getRawContent(); + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.AnnotatedDocument#getContainedEntities() + */ + @Override + public Set<Entity> getContainedEntities() { + return entities; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.AnnotatedDocument#getAnnotations() + */ + @Override + public Set<Annotation> getAnnotations() { + return annotations; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.AnnotatedDocument#getAnnotation(int, int) + */ + @Override + public Annotation getAnnotation(int offset, int length) { + for (Annotation annotation : annotations) { + if(annotation.getOffset() == offset && annotation.getLength() == length){ + return annotation; + } + } + return null; + } + + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.AnnotatedDocument#getEntityFrequency(org.dllearner.core.owl.Entity) + */ + @Override + public int getEntityFrequency(Entity entity) { + int cnt = 0; + for (Annotation annotation : annotations) { + if(annotation.getEntity().equals(entity)){ + cnt++; + } + } + return cnt; + } + +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-03 14:19:50 UTC (rev 4029) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-03 14:28:00 UTC (rev 4030) @@ -3,6 +3,7 @@ */ package org.dllearner.algorithms.isle.index.semantic.simple; +import org.dllearner.algorithms.isle.index.AnnotatedDocument; import org.dllearner.algorithms.isle.index.Document; import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; @@ -41,7 +42,7 @@ * @see org.dllearner.algorithms.isle.SemanticIndex#getDocuments(org.dllearner.core.owl.Entity) */ @Override - public Set<Document> getDocuments(Entity entity) { + public Set<AnnotatedDocument> getDocuments(Entity entity) { Set<Document> documents = new HashSet<Document>(); Map<String, Double> relevantText = labelRetriever.getRelevantText(entity); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-09-03 15:16:27
|
Revision: 4033 http://sourceforge.net/p/dl-learner/code/4033 Author: lorenz_b Date: 2013-09-03 15:16:21 +0000 (Tue, 03 Sep 2013) Log Message: ----------- Added annotation. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-09-03 14:43:56 UTC (rev 4032) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-09-03 15:16:21 UTC (rev 4033) @@ -23,7 +23,7 @@ * Returns all annotations of the document. * @return */ - Set<Annotation> getAnnotations(); + Set<SemanticAnnotation> getAnnotations(); /** * Returns the annotation at the given position(offset) of given length. @@ -31,7 +31,7 @@ * @param length * @return */ - Annotation getAnnotation(int offset, int length); + SemanticAnnotation getAnnotation(int offset, int length); /** * Returns the number of occurrences of the given entity in this document. Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-09-03 14:43:56 UTC (rev 4032) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-09-03 15:16:21 UTC (rev 4033) @@ -15,16 +15,16 @@ public class AnnotatedTextDocument implements AnnotatedDocument{ private TextDocument document; - private Set<Annotation> annotations; + private Set<SemanticAnnotation> annotations; private Set<Entity> entities; - public AnnotatedTextDocument(TextDocument document, Set<Annotation> annotations) { + public AnnotatedTextDocument(TextDocument document, Set<SemanticAnnotation> annotations) { this.document = document; this.annotations = annotations; entities = new HashSet<Entity>(); - for (Annotation annotation : annotations) { + for (SemanticAnnotation annotation : annotations) { entities.add(annotation.getEntity()); } } @@ -57,7 +57,7 @@ * @see org.dllearner.algorithms.isle.index.AnnotatedDocument#getAnnotations() */ @Override - public Set<Annotation> getAnnotations() { + public Set<SemanticAnnotation> getAnnotations() { return annotations; } @@ -65,8 +65,8 @@ * @see org.dllearner.algorithms.isle.index.AnnotatedDocument#getAnnotation(int, int) */ @Override - public Annotation getAnnotation(int offset, int length) { - for (Annotation annotation : annotations) { + public SemanticAnnotation getAnnotation(int offset, int length) { + for (SemanticAnnotation annotation : annotations) { if(annotation.getOffset() == offset && annotation.getLength() == length){ return annotation; } @@ -80,7 +80,7 @@ @Override public int getEntityFrequency(Entity entity) { int cnt = 0; - for (Annotation annotation : annotations) { + for (SemanticAnnotation annotation : annotations) { if(annotation.getEntity().equals(entity)){ cnt++; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-09-03 14:43:56 UTC (rev 4032) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-09-03 15:16:21 UTC (rev 4033) @@ -3,7 +3,6 @@ */ package org.dllearner.algorithms.isle.index; -import org.dllearner.core.owl.Entity; /** * @author Lorenz Buehmann @@ -12,13 +11,11 @@ public class Annotation { private Document getReferencedDocument; - private Entity entity; private int offset; private int length; - public Annotation(Document getReferencedDocument, Entity entity, int offset, int length) { + public Annotation(Document getReferencedDocument, int offset, int length) { this.getReferencedDocument = getReferencedDocument; - this.entity = entity; this.offset = offset; this.length = length; } @@ -27,10 +24,6 @@ return getReferencedDocument; } - public Entity getEntity() { - return entity; - } - public int getOffset() { return offset; } @@ -43,7 +36,6 @@ public int hashCode() { final int prime = 31; int result = 1; - result = prime * result + ((entity == null) ? 0 : entity.hashCode()); result = prime * result + ((getReferencedDocument == null) ? 0 : getReferencedDocument.hashCode()); result = prime * result + length; result = prime * result + offset; @@ -59,11 +51,6 @@ if (getClass() != obj.getClass()) return false; Annotation other = (Annotation) obj; - if (entity == null) { - if (other.entity != null) - return false; - } else if (!entity.equals(other.entity)) - return false; if (getReferencedDocument == null) { if (other.getReferencedDocument != null) return false; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-03 14:43:56 UTC (rev 4032) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotator.java 2013-09-03 15:16:21 UTC (rev 4033) @@ -7,7 +7,8 @@ * * @author Daniel Fleischhacker */ -public abstract class SemanticAnnotator { +public class SemanticAnnotator { + OWLOntology ontology; /** @@ -25,5 +26,7 @@ * @param document the document to annotate * @return the given document extended with annotations */ - public abstract AnnotatedDocument processDocument(Document document); + public AnnotatedDocument processDocument(Document document){ + return null; + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-03 14:43:56 UTC (rev 4032) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-03 15:16:21 UTC (rev 4033) @@ -43,7 +43,7 @@ */ @Override public Set<AnnotatedDocument> getDocuments(Entity entity) { - Set<Document> documents = new HashSet<Document>(); + Set<AnnotatedDocument> documents = new HashSet<AnnotatedDocument>(); Map<String, Double> relevantText = labelRetriever.getRelevantText(entity); for (Entry<String, Double> entry : relevantText.entrySet()) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-09-04 08:04:37
|
Revision: 4043 http://sourceforge.net/p/dl-learner/code/4043 Author: lorenz_b Date: 2013-09-04 08:04:33 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Fixed bug in linguistic annotator. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-09-03 16:53:18 UTC (rev 4042) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-09-04 08:04:33 UTC (rev 4043) @@ -87,5 +87,13 @@ } return cnt; } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return "Document:\n" + document.getContent() + "\nAnnotations:" + annotations; + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-09-03 16:53:18 UTC (rev 4042) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-09-04 08:04:33 UTC (rev 4043) @@ -63,6 +63,11 @@ return true; } - - + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return "\"" + getReferencedDocument.getContent().substring(offset, offset+length) + "\" at position " + offset; + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java 2013-09-03 16:53:18 UTC (rev 4042) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SemanticAnnotation.java 2013-09-04 08:04:33 UTC (rev 4043) @@ -52,7 +52,13 @@ return true; } - + /* (non-Javadoc) + * @see org.dllearner.algorithms.isle.index.Annotation#toString() + */ + @Override + public String toString() { + return super.toString() + "->" + entity; + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-03 16:53:18 UTC (rev 4042) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 08:04:33 UTC (rev 4043) @@ -14,15 +14,20 @@ @Override public Set<Annotation> annotate(Document document) { - String s = document.getRawContent(); + String s = document.getRawContent().trim(); Set<Annotation> annotations = new HashSet<Annotation>(); - Pattern pattern = Pattern.compile(" "); + Pattern pattern = Pattern.compile("\\u0020+"); Matcher matcher = pattern.matcher(s); // Check all occurrences + int start = 0; while (matcher.find()) { - annotations.add(new Annotation(document, matcher.start(), - matcher.end() - matcher.start())); + int end = matcher.start(); + annotations.add(new Annotation(document, start, end - start)); + start = matcher.end(); } + if(start < s.length()-1){ + annotations.add(new Annotation(document, start, s.length() - start)); + } return annotations; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-03 16:53:18 UTC (rev 4042) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-04 08:04:33 UTC (rev 4043) @@ -1,9 +1,11 @@ package org.dllearner.algorithms.isle.index.semantic; +import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; +import org.apache.lucene.document.Field; import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.WordSenseDisambiguation; import org.dllearner.algorithms.isle.index.AnnotatedDocument; @@ -12,6 +14,10 @@ import org.dllearner.algorithms.isle.index.TextDocument; import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLAnnotation; +import org.semanticweb.owlapi.model.OWLAnnotationProperty; +import org.semanticweb.owlapi.model.OWLEntity; +import org.semanticweb.owlapi.model.OWLLiteral; import org.semanticweb.owlapi.model.OWLOntology; /** @@ -43,6 +49,7 @@ * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents. */ public void buildIndex(Set<TextDocument> documents){ + index = new HashMap<Entity, Set<AnnotatedDocument>>(); for (TextDocument document : documents) { AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(document); for (Entity entity : annotatedDocument.getContainedEntities()) { @@ -56,6 +63,35 @@ } } + public void buildIndex(OWLAnnotationProperty annotationProperty, String language){ + Set<OWLEntity> schemaEntities = new HashSet<OWLEntity>(); + schemaEntities.addAll(ontology.getClassesInSignature()); + schemaEntities.addAll(ontology.getObjectPropertiesInSignature()); + schemaEntities.addAll(ontology.getDataPropertiesInSignature()); + Set<TextDocument> documents = new HashSet<TextDocument>(); + for (OWLEntity entity : schemaEntities) { + String label = null; + Set<OWLAnnotation> annotations = entity.getAnnotations(ontology, annotationProperty); + for (OWLAnnotation annotation : annotations) { + if (annotation.getValue() instanceof OWLLiteral) { + OWLLiteral val = (OWLLiteral) annotation.getValue(); + if (language != null) { + if(val.hasLang(language)){ + label = val.getLiteral(); + } + + } else { + label = val.getLiteral(); + } + } + } + if(label != null){ + documents.add(new TextDocument(label)); + } + } + buildIndex(documents); + } + /** * Returns the set of annotated documents which reference the given entity using one of its surface forms. * @@ -63,6 +99,11 @@ * @return documents referencing given entity */ public Set<AnnotatedDocument> getDocuments(Entity entity){ + if(index == null){ + System.err.println("You have to prebuild the index before you can use this method."); + System.exit(1); + } + Set<AnnotatedDocument> annotatedDocuments = index.get(entity); return annotatedDocuments; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-09-04 08:10:27
|
Revision: 4045 http://sourceforge.net/p/dl-learner/code/4045 Author: lorenz_b Date: 2013-09-04 08:10:22 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Fixed bug in linguistic annotator. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-09-04 08:10:08 UTC (rev 4044) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-09-04 08:10:22 UTC (rev 4045) @@ -93,7 +93,7 @@ */ @Override public String toString() { - return "Document:\n" + document.getContent() + "\nAnnotations:" + annotations; + return "Document:" + document.getContent() + "\nAnnotations:" + annotations; } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-09-04 08:10:08 UTC (rev 4044) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-09-04 08:10:22 UTC (rev 4045) @@ -55,4 +55,12 @@ public int hashCode() { return content.hashCode(); } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return content; + } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-04 08:10:08 UTC (rev 4044) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-09-04 08:10:22 UTC (rev 4045) @@ -5,7 +5,7 @@ import java.util.Map; import java.util.Set; -import org.apache.lucene.document.Field; +import org.apache.log4j.Logger; import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.WordSenseDisambiguation; import org.dllearner.algorithms.isle.index.AnnotatedDocument; @@ -29,6 +29,9 @@ */ public abstract class SemanticIndex { + + private static final Logger logger = Logger.getLogger(SemanticIndex.class.getName()); + private SemanticAnnotator semanticAnnotator; private SyntacticIndex syntacticIndex; private Map<Entity, Set<AnnotatedDocument>> index; @@ -49,8 +52,10 @@ * Precompute the whole index, i.e. iterate over all entities and compute all annotated documents. */ public void buildIndex(Set<TextDocument> documents){ + logger.info("Creating semantic index..."); index = new HashMap<Entity, Set<AnnotatedDocument>>(); for (TextDocument document : documents) { + logger.debug("Processing document:\n" + document); AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(document); for (Entity entity : annotatedDocument.getContainedEntities()) { Set<AnnotatedDocument> existingAnnotatedDocuments = index.get(entity); @@ -60,7 +65,8 @@ } existingAnnotatedDocuments.add(annotatedDocument); } - } + } + logger.info("...done."); } public void buildIndex(OWLAnnotationProperty annotationProperty, String language){ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-09-04 09:53:01
|
Revision: 4052 http://sourceforge.net/p/dl-learner/code/4052 Author: dfleischhacker Date: 2013-09-04 09:52:57 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Add n-grams to annotations from SimpleLinguisticAnnotator Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java 2013-09-04 09:45:38 UTC (rev 4051) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NGramGeneratingAnnotator.java 2013-09-04 09:52:57 UTC (rev 4052) @@ -24,7 +24,6 @@ public Set<Annotation> annotate(Document document) { String text = document.getContent(); - Pattern legalChars = Pattern.compile("[A-Za-z]"); // clean up all texts Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 09:45:38 UTC (rev 4051) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 09:52:57 UTC (rev 4052) @@ -22,6 +22,7 @@ public class SimpleLinguisticAnnotator implements LinguisticAnnotator { private StopWordFilter stopWordFilter = new StopWordFilter(); + NGramGeneratingAnnotator nGramAnnotator = new NGramGeneratingAnnotator(2); @Override public Set<Annotation> annotate(Document document) { @@ -41,6 +42,7 @@ if(start < s.length()-1){ annotations.add(new Annotation(document, start, s.length() - start)); } + annotations.addAll(nGramAnnotator.annotate(document)); stopWordFilter.removeStopWordAnnotations(annotations); return annotations; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <and...@us...> - 2013-09-04 14:40:03
|
Revision: 4064 http://sourceforge.net/p/dl-learner/code/4064 Author: andremelo Date: 2013-09-04 14:39:59 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Adding EntityCandidatesTries and the implementations of EntityCandidateGenerator and LinguisticAnnotator based on it Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-09-04 14:39:59 UTC (rev 4064) @@ -0,0 +1,41 @@ +package org.dllearner.algorithms.isle.index; + +import java.util.Map.Entry; +import java.util.Set; + +import org.dllearner.core.owl.Entity; + +public interface EntityCandidatesTrie { + + /** + * Adds an entry to the trie. If string already existent, adds to entity to its set of candidates + * @param s + * @param e + */ + public void addEntry(String s, Entity e); + + + /** + * Gets set of candidate entities for an exact given String + * @param s + * @return + */ + public Set<Entity> getCandidateEntities(String s); + + + /** + * Gets longest matching string and its candidate entities + * @param s + * @return + */ + public Entry<String,Set<Entity>> getLongestMatchWithCandidates(String s); + + /** + * Gets the longest matching string + * @param s + * @return + */ + public String getLongestMatch(String s); + + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-09-04 14:39:59 UTC (rev 4064) @@ -0,0 +1,27 @@ +package org.dllearner.algorithms.isle.index; + +import java.util.Set; + +import org.dllearner.algorithms.isle.EntityCandidateGenerator; +import org.dllearner.core.owl.Entity; +import org.semanticweb.owlapi.model.OWLOntology; + +/** + * Generates candidates using a entity candidates prefix trie + * @author Andre Melo + * + */ +public class TrieEntityCandidateGenerator extends EntityCandidateGenerator{ + + EntityCandidatesTrie candidatesTrie; + + public TrieEntityCandidateGenerator(OWLOntology ontology, EntityCandidatesTrie candidatesTrie) { + super(ontology); + this.candidatesTrie = candidatesTrie; + } + + public Set<Entity> getCandidates(Annotation annotation) { + return candidatesTrie.getCandidateEntities(annotation.getToken()); + } + +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-04 14:39:59 UTC (rev 4064) @@ -0,0 +1,41 @@ +package org.dllearner.algorithms.isle.index; + +import java.util.HashSet; +import java.util.Set; + +/** + * Annotates a document using a prefix trie + * @author Andre Melo + * + */ +public class TrieLinguisticAnnotator implements LinguisticAnnotator { + + EntityCandidatesTrie candidatesTrie; + + public TrieLinguisticAnnotator(EntityCandidatesTrie candidatesTrie) { + this.candidatesTrie = candidatesTrie; + } + + /** + * Generates annotation based on trie's longest matching strings + * @param document + * @param candidatesTrie + * @return + */ + @Override + public Set<Annotation> annotate(Document document) { + String content = document.getRawContent(); + Set<Annotation> annotations = new HashSet<Annotation>(); + for (int i=0; i<content.length(); i++) { + String unparsed = content.substring(i); + String match = candidatesTrie.getLongestMatch(unparsed); + if (match!=null && !match.isEmpty()) { + Annotation annotation = new Annotation(document, i, match.length()); + annotations.add(annotation); + i += match.length()-1; + } + } + return annotations; + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <and...@us...> - 2013-09-04 15:22:11
|
Revision: 4066 http://sourceforge.net/p/dl-learner/code/4066 Author: andremelo Date: 2013-09-04 15:22:06 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Adding SimpleEntityCandidates and updating EntityCandidateTrie Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-09-04 15:04:37 UTC (rev 4065) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-09-04 15:22:06 UTC (rev 4066) @@ -4,11 +4,12 @@ import java.util.Set; import org.dllearner.core.owl.Entity; +import org.dllearner.utilities.datastructures.PrefixTrie; public interface EntityCandidatesTrie { - + /** - * Adds an entry to the trie. If string already existent, adds to entity to its set of candidates + * Adds an entity to the set of candidates of a string * @param s * @param e */ @@ -24,13 +25,6 @@ /** - * Gets longest matching string and its candidate entities - * @param s - * @return - */ - public Entry<String,Set<Entity>> getLongestMatchWithCandidates(String s); - - /** * Gets the longest matching string * @param s * @return Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-09-04 15:22:06 UTC (rev 4066) @@ -0,0 +1,49 @@ +package org.dllearner.algorithms.isle.index; + +import java.util.HashSet; +import java.util.Map.Entry; +import java.util.Set; + +import org.dllearner.core.owl.Entity; +import org.dllearner.utilities.datastructures.PrefixTrie; +import org.semanticweb.owlapi.model.OWLOntology; + +public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { + + PrefixTrie<Set<Entity>> trie; + OWLOntology ontology; + + public SimpleEntityCandidatesTrie(OWLOntology ontology) { + this.ontology = ontology; + this.trie = new PrefixTrie<Set<Entity>>(); + } + + @Override + public void addEntry(String s, Entity e) { + Set<Entity> candidates = trie.get(s); + if (candidates==null) + candidates = new HashSet<Entity>(); + + candidates.add(e); + } + + @Override + public Set<Entity> getCandidateEntities(String s) { + // TODO Auto-generated method stub + return null; + } + + @Override + public String getLongestMatch(String s) { + return trie.getLongestMatch(s).toString(); + } + + /** + * @param args + */ + public static void main(String[] args) { + // TODO Auto-generated method stub + + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-09-04 16:10:06
|
Revision: 4072 http://sourceforge.net/p/dl-learner/code/4072 Author: dfleischhacker Date: 2013-09-04 16:10:03 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Use trie Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-04 16:04:20 UTC (rev 4071) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-09-04 16:10:03 UTC (rev 4072) @@ -2,15 +2,10 @@ import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.DefaultLemmatizer; import edu.northwestern.at.utils.corpuslinguistics.lemmatizer.Lemmatizer; -import edu.stanford.nlp.ling.CoreAnnotations; -import edu.stanford.nlp.ling.CoreLabel; -import edu.stanford.nlp.pipeline.*; -import edu.stanford.nlp.util.CoreMap; import net.didion.jwnl.data.POS; import org.dllearner.algorithms.isle.WordNet; import java.util.ArrayList; -import java.util.Properties; /** * Provides shortcuts to commonly used linguistic operations Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-04 16:04:20 UTC (rev 4071) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-04 16:10:03 UTC (rev 4072) @@ -6,9 +6,13 @@ import org.dllearner.algorithms.isle.RandomWordSenseDisambiguation; import org.dllearner.algorithms.isle.SimpleWordSenseDisambiguation; import org.dllearner.algorithms.isle.index.SimpleEntityCandidateGenerator; +import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie; import org.dllearner.algorithms.isle.index.SimpleLinguisticAnnotator; +import org.dllearner.algorithms.isle.index.TrieEntityCandidateGenerator; import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; +import org.dllearner.algorithms.isle.textretrieval.AnnotationEntityTextRetriever; +import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; import org.semanticweb.owlapi.model.OWLOntology; /** @@ -27,11 +31,12 @@ * @param syntacticIndex index to query for documents containing the labels */ public SimpleSemanticIndex(OWLOntology ontology, SyntacticIndex syntacticIndex) { - super(ontology, - syntacticIndex, - new SimpleWordSenseDisambiguation(ontology), - new SimpleEntityCandidateGenerator(ontology), - new SimpleLinguisticAnnotator()); + super(ontology, + syntacticIndex, + new SimpleWordSenseDisambiguation(ontology), + new TrieEntityCandidateGenerator(ontology, new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology))), + new SimpleLinguisticAnnotator()); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-09-04 16:14:42
|
Revision: 4073 http://sourceforge.net/p/dl-learner/code/4073 Author: lorenz_b Date: 2013-09-04 16:14:38 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Added text normalization. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 16:10:03 UTC (rev 4072) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 16:14:38 UTC (rev 4073) @@ -43,7 +43,7 @@ annotations.add(new Annotation(document, start, s.length() - start)); } annotations.addAll(nGramAnnotator.annotate(document)); - stopWordFilter.removeStopWordAnnotations(annotations); +// stopWordFilter.removeStopWordAnnotations(annotations); return annotations; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-04 16:10:03 UTC (rev 4072) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-04 16:14:38 UTC (rev 4073) @@ -3,7 +3,6 @@ */ package org.dllearner.algorithms.isle.index.semantic.simple; -import org.dllearner.algorithms.isle.RandomWordSenseDisambiguation; import org.dllearner.algorithms.isle.SimpleWordSenseDisambiguation; import org.dllearner.algorithms.isle.index.SimpleEntityCandidateGenerator; import org.dllearner.algorithms.isle.index.SimpleEntityCandidatesTrie; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-09-04 16:30:31
|
Revision: 4077 http://sourceforge.net/p/dl-learner/code/4077 Author: dfleischhacker Date: 2013-09-04 16:30:27 +0000 (Wed, 04 Sep 2013) Log Message: ----------- Fix usage of raw contents from documents Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 16:26:24 UTC (rev 4076) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleLinguisticAnnotator.java 2013-09-04 16:30:27 UTC (rev 4077) @@ -26,7 +26,7 @@ @Override public Set<Annotation> annotate(Document document) { - String s = document.getRawContent().trim(); + String s = document.getContent().trim(); System.out.println("Document:" + s); // s = stopWordFilter.removeStopWords(s); Set<Annotation> annotations = new HashSet<Annotation>(); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-04 16:26:24 UTC (rev 4076) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-09-04 16:30:27 UTC (rev 4077) @@ -19,12 +19,11 @@ /** * Generates annotation based on trie's longest matching strings * @param document - * @param candidatesTrie * @return */ @Override public Set<Annotation> annotate(Document document) { - String content = document.getRawContent(); + String content = document.getContent(); Set<Annotation> annotations = new HashSet<Annotation>(); for (int i=0; i<content.length(); i++) { String unparsed = content.substring(i); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-04 16:26:24 UTC (rev 4076) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-09-04 16:30:27 UTC (rev 4077) @@ -33,7 +33,7 @@ super(ontology, syntacticIndex, new SimpleWordSenseDisambiguation(ontology), - new TrieEntityCandidateGenerator(ontology, new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology))), + new TrieEntityCandidateGenerator(ontology, new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology)), new SimpleLinguisticAnnotator()); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-09-09 10:11:44
|
Revision: 4098 http://sourceforge.net/p/dl-learner/code/4098 Author: dfleischhacker Date: 2013-09-09 10:11:41 +0000 (Mon, 09 Sep 2013) Log Message: ----------- Improve some documentation Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-09-09 10:10:55 UTC (rev 4097) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-09-09 10:11:41 UTC (rev 4098) @@ -5,6 +5,7 @@ /** + * A (non-semantic) annotation which represents an entity in a document by its offset and length. * @author Lorenz Buehmann * */ Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java 2013-09-09 10:10:55 UTC (rev 4097) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticAnnotator.java 2013-09-09 10:11:41 UTC (rev 4098) @@ -6,11 +6,15 @@ import java.util.Set; /** + * Interface for generating (non-semantic) annotations for documents. * @author Lorenz Buehmann - * */ public interface LinguisticAnnotator { - + /** + * Returns the set of annotation for the given document. + * @param document the document to get annotation for + * @return set of annotations for the given document + */ Set<Annotation> annotate(Document document); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-10-07 09:15:23
|
Revision: 4120 http://sourceforge.net/p/dl-learner/code/4120 Author: dfleischhacker Date: 2013-10-07 09:15:20 +0000 (Mon, 07 Oct 2013) Log Message: ----------- Fix bug leading to out of bounds exception Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/StanfordLemmatizer.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-10-07 07:38:17 UTC (rev 4119) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/LinguisticUtil.java 2013-10-07 09:15:20 UTC (rev 4120) @@ -137,7 +137,7 @@ else { res.append(" "); } - res.append(lemmatizeSingleWord(word)); + res.append(lemmatizeSingleWord(w)); } catch (Exception e) { throw new RuntimeException(e); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-10-07 07:38:17 UTC (rev 4119) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-10-07 09:15:20 UTC (rev 4120) @@ -149,7 +149,8 @@ @Override public Set<Entity> getCandidateEntities(String s) { - return trie.get(s); + Set<Entity> res = trie.get(s); + return res == null ? new HashSet<Entity>() : trie.get(s); } @Override @@ -263,4 +264,34 @@ return res; } } + + /** + * Pair of the actual word and the word after processing. + */ + public static class ActualModifiedWordPair { + private String actualString; + private String modifiedString; + + public String getActualString() { + return actualString; + } + + public void setActualString(String actualString) { + this.actualString = actualString; + } + + public String getModifiedString() { + return modifiedString; + } + + public void setModifiedString(String modifiedString) { + this.modifiedString = modifiedString; + } + + public ActualModifiedWordPair(String actualString, String modifiedString) { + + this.actualString = actualString; + this.modifiedString = modifiedString; + } + } } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/StanfordLemmatizer.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/StanfordLemmatizer.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/StanfordLemmatizer.java 2013-10-07 09:15:20 UTC (rev 4120) @@ -0,0 +1,54 @@ +package org.dllearner.algorithms.isle.index; + +import edu.stanford.nlp.ling.CoreAnnotations; +import edu.stanford.nlp.ling.CoreLabel; +import edu.stanford.nlp.pipeline.StanfordCoreNLP; +import edu.stanford.nlp.util.CoreMap; + +import java.util.LinkedList; +import java.util.List; +import java.util.Properties; + +/** + * + */ +class StanfordLemmatizer { + + protected StanfordCoreNLP pipeline; + + public StanfordLemmatizer() { + // Create StanfordCoreNLP object properties, with POS tagging + // (required for lemmatization), and lemmatization + Properties props; + props = new Properties(); + props.put("annotators", "tokenize, ssplit, pos, lemma"); + + // StanfordCoreNLP loads a lot of models, so you probably + // only want to do this once per execution + this.pipeline = new StanfordCoreNLP(props); + } + + public String lemmatize(String documentText) + { + List<String> lemmas = new LinkedList<String>(); + + // create an empty Annotation just with the given text + edu.stanford.nlp.pipeline.Annotation document = new edu.stanford.nlp.pipeline.Annotation(documentText); + + // run all Annotators on this text + this.pipeline.annotate(document); + + // Iterate over all of the sentences found + List<CoreMap> sentences = document.get(CoreAnnotations.SentencesAnnotation.class); + for(CoreMap sentence: sentences) { + // Iterate over all tokens in a sentence + for (CoreLabel token: sentence.get(CoreAnnotations.TokensAnnotation.class)) { + // Retrieve and add the lemma for each word into the + // list of lemmas + lemmas.add(token.get(CoreAnnotations.LemmaAnnotation.class)); + } + } + + return lemmas.get(0); + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-10-07 07:38:17 UTC (rev 4119) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-10-07 09:15:20 UTC (rev 4120) @@ -37,6 +37,8 @@ } String match = candidatesTrie.getLongestMatch(unparsed); if (match != null && !match.isEmpty()) { + + //TODO: here we are losing the original offset and index... Annotation annotation = new Annotation(document, i, match.length()); annotations.add(annotation); i += match.length() - 1; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-10-24 13:56:30
|
Revision: 4130 http://sourceforge.net/p/dl-learner/code/4130 Author: dfleischhacker Date: 2013-10-24 13:56:26 +0000 (Thu, 24 Oct 2013) Log Message: ----------- Get the ISLE pipeline working * Ability to resolve match in trie to the producing string (the pre-wordnet one) * Add NormalizedTextMapper for mapping normalized words to their original documents * Activate structure based WSD Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-10-24 13:47:58 UTC (rev 4129) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -1,11 +1,9 @@ package org.dllearner.algorithms.isle.index; -import java.util.Map.Entry; +import org.dllearner.core.owl.Entity; + import java.util.Set; -import org.dllearner.core.owl.Entity; -import org.dllearner.utilities.datastructures.PrefixTrie; - public interface EntityCandidatesTrie { /** @@ -22,14 +20,22 @@ * @return */ public Set<Entity> getCandidateEntities(String s); - - + + /** - * Gets the longest matching string - * @param s - * @return + * Returns the string on which this entry is based on. This is used e.g. for storing the original + * ontology string when the parameter string has been added to the trie after generation by using + * WordNet or other additional methods. + * + * @param s the string to search in the trie + * @return string generating the path of the longest match in the trie */ - public String getLongestMatch(String s); - - + public String getGeneratingStringForLongestMatch(String s); + + /** + * Gets the longest matching string + * @param s + * @return + */ + public String getLongestMatchingText(String s); } Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -0,0 +1,31 @@ +package org.dllearner.algorithms.isle.index; + +import org.dllearner.core.owl.Entity; + +import java.util.HashSet; +import java.util.Set; + +/** + * A pair consisting of a full string token and the corresponding entities + */ +public class FullTokenEntitySetPair { + private String fullToken; + private Set<Entity> entitySet; + + public FullTokenEntitySetPair(String fullToken) { + this.fullToken = fullToken; + this.entitySet = new HashSet<Entity>(); + } + + public String getFullToken() { + return fullToken; + } + + public Set<Entity> getEntitySet() { + return entitySet; + } + + public void addEntity(Entity entity) { + entitySet.add(entity); + } +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -0,0 +1,140 @@ +package org.dllearner.algorithms.isle.index; + +import java.util.ArrayList; + +/** + * Provides text normalization and mapping of normalized ranges to the original ones. + */ +public class NormalizedTextMapper { + private Document originalDocument; + private String originalText; + private String normalizedText; + + private ArrayList<OccurenceMappingPair> normalizedIndexToOriginalIndex; + + public NormalizedTextMapper(Document original) { + this.originalDocument = original; + this.originalText = original.getContent(); + this.normalizedIndexToOriginalIndex = new ArrayList<OccurenceMappingPair>(); + + StringBuilder sb = new StringBuilder(); + int currentOriginalIndex = 0; + for (String originalWord : originalText.split(" ")) { + String normalizedWord = getNormalizedWord(originalWord); + normalizedIndexToOriginalIndex + .add(new OccurenceMappingPair(currentOriginalIndex, originalWord.length(), sb.length(), + normalizedWord.length())); + currentOriginalIndex += originalWord.length() + 1; + sb.append(normalizedWord); + sb.append(" "); + } + normalizedText = sb.toString(); + } + + public String getOriginalText() { + return originalText; + } + + public String getNormalizedText() { + return normalizedText; + } + + /** + * Returns the annotation for the original text matching the given position and length in the normalized + * text. + * + * @param position position in the normalized text to get annotation for + * @param length length of the text to get annotation for + * @return + */ + public Annotation getOriginalAnnotationForPosition(int position, int length) { + int curNormalizedLength = 0; + int originalStart = -1; + int curOriginalLength = 0; + + for (OccurenceMappingPair p : normalizedIndexToOriginalIndex) { + if (p.getNormalizedIndex() == position) { + originalStart = p.getOriginalIndex(); + } + if (originalStart != -1) { + curNormalizedLength += p.getNormalizedLength(); + curOriginalLength += p.getOriginalLength(); + if (curNormalizedLength >= length) { + return new Annotation(originalDocument, originalStart, curOriginalLength); + } + + // include space + curNormalizedLength += 1; + curOriginalLength += 1; + } + } + + return null; + } + + /** + * Returns the normalized form of the given word. Word must not contain any spaces or the like. + * @param word + * @return + */ + private String getNormalizedWord(String word) { + return LinguisticUtil.getInstance().getNormalizedForm(word); + } + + public static void main(String[] args) { + NormalizedTextMapper n = new NormalizedTextMapper(new TextDocument("This is a testing text using letters")); + System.out.println(n.getOriginalText()); + System.out.println(n.getNormalizedText()); + for (OccurenceMappingPair p : n.normalizedIndexToOriginalIndex) { + System.out.println(p); + } + System.out.println(n.getOriginalAnnotationForPosition(7,6)); + System.out.println(n.getOriginalAnnotationForPosition(23,6)); + System.out.println(n.getOriginalAnnotationForPosition(7,1)); + System.out.println(n.getOriginalAnnotationForPosition(14,15)); + } + + /** + * Maps words identified by index and length in the normalized texts to the original word. + */ + private class OccurenceMappingPair { + private int originalIndex; + private int originalLength; + private int normalizedIndex; + private int normalizedLength; + + private OccurenceMappingPair(int originalIndex, int originalLength, int normalizedIndex, int normalizedLength) { + + this.originalIndex = originalIndex; + this.originalLength = originalLength; + this.normalizedIndex = normalizedIndex; + this.normalizedLength = normalizedLength; + } + + private int getNormalizedIndex() { + return normalizedIndex; + } + + private int getNormalizedLength() { + return normalizedLength; + } + + private int getOriginalLength() { + return originalLength; + } + + private int getOriginalIndex() { + return originalIndex; + } + + @Override + public String toString() { + return "OccurenceMappingPair{" + + "originalIndex=" + originalIndex + + ", originalLength=" + originalLength + + ", normalizedIndex=" + normalizedIndex + + ", normalizedLength=" + normalizedLength + + '}'; + } + } +} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-10-24 13:47:58 UTC (rev 4129) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -10,7 +10,7 @@ public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { - PrefixTrie<Set<Entity>> trie; + PrefixTrie<FullTokenEntitySetPair> trie; EntityTextRetriever entityTextRetriever; // /** @@ -39,7 +39,7 @@ } public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { - this.trie = new PrefixTrie<Set<Entity>>(); + this.trie = new PrefixTrie<FullTokenEntitySetPair>(); Map<Entity, Set<String>> relevantText = entityTextRetriever.getRelevantText(ontology); for (Entity entity : relevantText.keySet()) { @@ -55,7 +55,7 @@ addSubsequencesWordNet(entity, text); for (String alternativeText : nameGenerator.getAlternativeText(text)) { - addEntry(alternativeText, entity); + addEntry(alternativeText, entity, text); } } } @@ -136,37 +136,55 @@ @Override public void addEntry(String s, Entity e) { - Set<Entity> candidates; + FullTokenEntitySetPair candidates; if (trie.contains(s)) candidates = trie.get(s); else - candidates = new HashSet<Entity>(); + candidates = new FullTokenEntitySetPair(s); - candidates.add(e); + candidates.addEntity(e); trie.put(s, candidates); } + public void addEntry(String s, Entity e, String originalString) { + FullTokenEntitySetPair candidates; + if (trie.contains(s)) + candidates = trie.get(s); + else + candidates = new FullTokenEntitySetPair(originalString); + + candidates.addEntity(e); + + trie.put(s, candidates); + } + @Override public Set<Entity> getCandidateEntities(String s) { - Set<Entity> res = trie.get(s); - return res == null ? new HashSet<Entity>() : trie.get(s); + FullTokenEntitySetPair res = trie.get(s); + return res == null ? new HashSet<Entity>() : trie.get(s).getEntitySet(); } @Override - public String getLongestMatch(String s) { + public String getGeneratingStringForLongestMatch(String s) { CharSequence match = trie.getLongestMatch(s); - return (match!=null) ? match.toString() : null; + return (match!=null) ? trie.get(match).getFullToken() : null; } + + @Override + public String getLongestMatchingText(String s) { + CharSequence match = trie.getLongestMatch(s); + return (match!=null) ? match.toString() : null; + } public String toString() { String output = ""; - Map<String,Set<Entity>> trieMap = trie.toMap(); + Map<String,FullTokenEntitySetPair> trieMap = trie.toMap(); List<String> termsList = new ArrayList<String>(trieMap.keySet()); Collections.sort(termsList); for (String key : termsList) { output += key + ":\n"; - for (Entity candidate: trieMap.get(key)) { + for (Entity candidate: trieMap.get(key).getEntitySet()) { output += "\t"+candidate+"\n"; } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-10-24 13:47:58 UTC (rev 4129) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -25,24 +25,23 @@ */ @Override public Set<Annotation> annotate(Document document) { - String content = document.getContent(); Set<Annotation> annotations = new HashSet<Annotation>(); + NormalizedTextMapper mapper = new NormalizedTextMapper(document); + String content = mapper.getNormalizedText(); for (int i = 0; i < content.length(); i++) { if (Character.isWhitespace(content.charAt(i))) { continue; } String unparsed = content.substring(i); - if (normalizeWords) { - unparsed = LinguisticUtil.getInstance().getNormalizedForm(unparsed); - } - String match = candidatesTrie.getLongestMatch(unparsed); + String match = candidatesTrie.getLongestMatchingText(unparsed); if (match != null && !match.isEmpty()) { - - //TODO: here we are losing the original offset and index... - Annotation annotation = new Annotation(document, i, match.length()); + Annotation annotation = mapper.getOriginalAnnotationForPosition(i, match.length()); annotations.add(annotation); i += match.length() - 1; } + while (!Character.isWhitespace(content.charAt(i)) && i < content.length()) { + i++; + } } return annotations; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-10-24 13:47:58 UTC (rev 4129) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-10-24 13:56:26 UTC (rev 4130) @@ -10,7 +10,8 @@ import org.dllearner.algorithms.isle.index.semantic.SemanticIndex; import org.dllearner.algorithms.isle.index.syntactic.SyntacticIndex; import org.dllearner.algorithms.isle.textretrieval.RDFSLabelEntityTextRetriever; -import org.dllearner.algorithms.isle.wsd.SimpleWordSenseDisambiguation; +import org.dllearner.algorithms.isle.wsd.StructureBasedWordSenseDisambiguation; +import org.dllearner.algorithms.isle.wsd.WindowBasedContextExtractor; import org.semanticweb.owlapi.model.OWLOntology; /** @@ -56,7 +57,7 @@ TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(trie); linguisticAnnotator.setNormalizeWords(useWordNormalization); setSemanticAnnotator(new SemanticAnnotator( - new SimpleWordSenseDisambiguation(ontology), + new StructureBasedWordSenseDisambiguation(new WindowBasedContextExtractor(), ontology), new TrieEntityCandidateGenerator(ontology, trie), linguisticAnnotator)); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-10-29 14:11:10
|
Revision: 4133 http://sourceforge.net/p/dl-learner/code/4133 Author: dfleischhacker Date: 2013-10-29 14:11:07 +0000 (Tue, 29 Oct 2013) Log Message: ----------- Bug searching Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-10-29 13:23:45 UTC (rev 4132) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Annotation.java 2013-10-29 14:11:07 UTC (rev 4133) @@ -14,8 +14,17 @@ private Document referencedDocument; private int offset; private int length; - - public Annotation(Document referencedDocument, int offset, int length) { + private String matchedString; + + public String getMatchedString() { + return matchedString; + } + + public void setMatchedString(String matchedString) { + this.matchedString = matchedString; + } + + public Annotation(Document referencedDocument, int offset, int length) { this.referencedDocument = referencedDocument; this.offset = offset; this.length = length; Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-10-29 13:23:45 UTC (rev 4132) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-10-29 14:11:07 UTC (rev 4133) @@ -1,20 +1,13 @@ package org.dllearner.algorithms.isle.index; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.regex.Pattern; - +import com.google.common.collect.Sets; import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.StopWordFilter; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; -import com.google.common.collect.Sets; +import java.util.*; +import java.util.regex.Pattern; /** * Generates candidates using a entity candidates prefix trie @@ -33,7 +26,7 @@ } public Set<Entity> getCandidates(Annotation annotation) { - return candidatesTrie.getCandidateEntities(annotation.getToken()); + return candidatesTrie.getCandidateEntities(annotation.getMatchedString()); } /** @@ -131,7 +124,7 @@ for (Annotation annotation: annotations) candidatesMap.put(annotation, getCandidates(annotation)); - postProcess(candidatesMap, window, stopWordFilter); + //postProcess(candidatesMap, window, stopWordFilter); return candidatesMap; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-10-29 13:23:45 UTC (rev 4132) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-10-29 14:11:07 UTC (rev 4133) @@ -36,6 +36,7 @@ String match = candidatesTrie.getLongestMatchingText(unparsed); if (match != null && !match.isEmpty()) { Annotation annotation = mapper.getOriginalAnnotationForPosition(i, match.length()); + annotation.setMatchedString(match); annotations.add(annotation); i += match.length() - 1; } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-10-29 13:23:45 UTC (rev 4132) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/simple/SimpleSemanticIndex.java 2013-10-29 14:11:07 UTC (rev 4133) @@ -53,7 +53,7 @@ trie = new SimpleEntityCandidatesTrie(new RDFSLabelEntityTextRetriever(ontology), ontology, new SimpleEntityCandidatesTrie.DummyNameGenerator()); } -// trie.printTrie(); + trie.printTrie(); TrieLinguisticAnnotator linguisticAnnotator = new TrieLinguisticAnnotator(trie); linguisticAnnotator.setNormalizeWords(useWordNormalization); setSemanticAnnotator(new SemanticAnnotator( This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-08 09:28:20
|
Revision: 4140 http://sourceforge.net/p/dl-learner/code/4140 Author: lorenz_b Date: 2013-11-08 09:28:17 +0000 (Fri, 08 Nov 2013) Log Message: ----------- Decreased log level. Small improvements in toString of trie.. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-10-29 15:08:54 UTC (rev 4139) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-11-08 09:28:17 UTC (rev 4140) @@ -3,10 +3,12 @@ import org.apache.commons.lang.StringUtils; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; +import org.dllearner.utilities.MapUtils; import org.dllearner.utilities.datastructures.PrefixTrie; import org.semanticweb.owlapi.model.OWLOntology; import java.util.*; +import java.util.Map.Entry; public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { @@ -50,6 +52,7 @@ if (text.trim().isEmpty()) { continue; } + text = text.trim(); addEntry(text, entity); addSubsequencesWordNet(entity, text); @@ -59,6 +62,7 @@ } } } + } /** @@ -188,17 +192,18 @@ } public String toString() { - String output = ""; + StringBuilder output = new StringBuilder(); Map<String,FullTokenEntitySetPair> trieMap = trie.toMap(); - List<String> termsList = new ArrayList<String>(trieMap.keySet()); - Collections.sort(termsList); - for (String key : termsList) { - output += key + " (" + trieMap.get(key).getFullToken() + ") :\n"; - for (Entity candidate: trieMap.get(key).getEntitySet()) { - output += "\t"+candidate+"\n"; + + for (Entry<String, FullTokenEntitySetPair> entry : trieMap.entrySet()) { + String key = entry.getKey(); + FullTokenEntitySetPair pair = entry.getValue(); + output.append(key + " (" + pair.getFullToken() + ") :\n"); + for (Entity candidate: pair.getEntitySet()) { + output.append("\t"+candidate+"\n"); } } - return output; + return output.toString(); } public static void main(String[] args) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-10-29 15:08:54 UTC (rev 4139) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/semantic/SemanticIndex.java 2013-11-08 09:28:17 UTC (rev 4140) @@ -67,7 +67,7 @@ logger.info("Creating semantic index..."); index = new HashMap<Entity, Set<AnnotatedDocument>>(); for (TextDocument document : documents) { - logger.info("Processing document:\n" + document); + logger.debug("Processing document:" + document); AnnotatedDocument annotatedDocument = semanticAnnotator.processDocument(document); for (Entity entity : annotatedDocument.getContainedEntities()) { Set<AnnotatedDocument> existingAnnotatedDocuments = index.get(entity); @@ -77,7 +77,7 @@ } existingAnnotatedDocuments.add(annotatedDocument); } - logger.info("Annotated document:" + annotatedDocument); + logger.debug("Annotated document:" + annotatedDocument); } size = documents.size(); logger.info("...done."); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-10-29 15:08:54 UTC (rev 4139) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/LuceneSyntacticIndex.java 2013-11-08 09:28:17 UTC (rev 4140) @@ -78,6 +78,20 @@ public int getSize() { return indexReader.numDocs(); } + + public Set<TextDocument> getAllDocuments(){ + Set<TextDocument> documents = new HashSet<TextDocument>(indexReader.numDocs()); + for (int i = 0; i < indexReader.numDocs(); i++) { + try { + Document doc = indexReader.document(i); + String content = doc.get(searchField); + documents.add(new TextDocument(content)); + } catch (IOException e) { + e.printStackTrace(); + } + } + return documents; + } /* (non-Javadoc) * @see org.dllearner.algorithms.isle.SyntacticIndex#count(java.lang.String) Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-10-29 15:08:54 UTC (rev 4139) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/syntactic/TextDocumentSyntacticIndexCreator.java 2013-11-08 09:28:17 UTC (rev 4140) @@ -14,6 +14,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; +import org.dllearner.algorithms.isle.index.TextDocument; import java.io.BufferedReader; import java.io.File; @@ -77,14 +78,41 @@ return new LuceneSyntacticIndex(indexDirectory, searchField); } + + public SyntacticIndex buildIndex(Set<TextDocument> documents) throws Exception{ + Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); + IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer); + IndexWriter writer = new IndexWriter(indexDirectory, indexWriterConfig); + System.out.println( "Creating index ..." ); + Set<org.apache.lucene.document.Document> luceneDocuments = new HashSet<org.apache.lucene.document.Document>(); + FieldType stringType = new FieldType(StringField.TYPE_STORED); + stringType.setStoreTermVectors(false); + FieldType textType = new FieldType(TextField.TYPE_STORED); + textType.setStoreTermVectors(false); + + int id = 1; + for (TextDocument document : documents) { + org.apache.lucene.document.Document luceneDocument = new org.apache.lucene.document.Document(); + luceneDocument.add(new Field("uri", Integer.toString(id++), stringType)); + luceneDocument.add(new Field(searchField, document.getContent(), textType)); + luceneDocuments.add(luceneDocument); + } + writer.addDocuments(luceneDocuments); + + System.out.println("Done."); + writer.close(); + + return new LuceneSyntacticIndex(indexDirectory, searchField); + } + public static SyntacticIndex loadIndex(File indexDirectory) throws Exception { return new LuceneSyntacticIndex(new SimpleFSDirectory(indexDirectory), searchField); } public static void main(String[] args) throws Exception { if (args.length != 2) { - System.err.println("Usage: <input director> <index directory>"); + System.err.println("Usage: <input directory> <index directory>"); System.exit(1); return; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-21 12:25:43
|
Revision: 4156 http://sourceforge.net/p/dl-learner/code/4156 Author: lorenz_b Date: 2013-11-21 12:25:40 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Added token class. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 11:54:23 UTC (rev 4155) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 12:25:40 UTC (rev 4156) @@ -29,7 +29,7 @@ } private void buildCleanedContent(){ - this.content = content.toLowerCase(); + this.content = rawContent.toLowerCase(); this.content = this.content.replaceAll("[^a-z ]", " "); this.content = this.content.replaceAll("\\s{2,}", " "); this.content = this.content.trim(); Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-11-21 12:25:40 UTC (rev 4156) @@ -0,0 +1,64 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +/** + * @author Lorenz Buehmann + * + */ +public class Token { + + private String rawForm; + private String stemmedForm; + private String posTag; + + public Token(String rawForm) { + posTag = rawForm; + } + + /** + * @return the rawForm + */ + public String getRawForm() { + return rawForm; + } + + /** + * @return the stemmedForm + */ + public String getStemmedForm() { + return stemmedForm; + } + + /** + * @return the posTag + */ + public String getPOSTag() { + return posTag; + } + + /** + * @param stemmedForm the stemmedForm to set + */ + public void setStemmedForm(String stemmedForm) { + this.stemmedForm = stemmedForm; + } + + /** + * @param posTag the posTag to set + */ + public void setPOSTag(String posTag) { + this.posTag = posTag; + } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return "Word: " + rawForm + "\n" + + "Stemmed word: " + stemmedForm + "\n" + + "POS tag: " + posTag; + } +} Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/WordTypeComparator.java 2013-11-21 12:25:40 UTC (rev 4156) @@ -0,0 +1,27 @@ +/** + * + */ +package org.dllearner.algorithms.isle.index; + +/** + * Compare the word types of two given words. + * @author Lorenz Buehmann + * + */ +public class WordTypeComparator { + + /** + * Returns TRUE if both POS tags are related to the same word type, i.e. whether both are NOUNS, VERBS, etc. , + * else FALSE is returned. + * @param posTag1 the POS tag of the first word + * @param posTag2 the POS tag of the second word + * @return + */ + public static boolean sameWordType(String posTag1, String posTag2){ + if(posTag1.startsWith("NN") && posTag2.startsWith("NN") || + posTag1.startsWith("V") && posTag2.startsWith("V")){ + return true; + } + return false; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 12:51:08
|
Revision: 4160 http://sourceforge.net/p/dl-learner/code/4160 Author: dfleischhacker Date: 2013-11-21 12:51:05 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Move enum SurfaceFormLevel to own class Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java Added Paths: ----------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SurfaceFormLevel.java Added: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SurfaceFormLevel.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SurfaceFormLevel.java (rev 0) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SurfaceFormLevel.java 2013-11-21 12:51:05 UTC (rev 4160) @@ -0,0 +1,13 @@ +package org.dllearner.algorithms.isle.index; + +/** + * Different levels of surface forms supported by the {@link TextDocument} class. Used for retrieving certain types + * of texts. + * + * @author Daniel Fleischhacker + */ +public enum SurfaceFormLevel { + RAW, + POS_TAGGED, + STEMMED +} \ No newline at end of file Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 12:44:09 UTC (rev 4159) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TextDocument.java 2013-11-21 12:51:05 UTC (rev 4160) @@ -10,25 +10,19 @@ public class TextDocument extends LinkedList<Token> implements Document { @Override public String getContent() { - return getContentStartingAtToken(this.getFirst(), Level.STEMMED); + return getContentStartingAtToken(this.getFirst(), SurfaceFormLevel.STEMMED); } @Override public String getRawContent() { - return getContentStartingAtToken(this.getFirst(), Level.RAW); + return getContentStartingAtToken(this.getFirst(), SurfaceFormLevel.RAW); } @Override public String getPOSTaggedContent() { - return getContentStartingAtToken(this.getFirst(), Level.POS_TAGGED); + return getContentStartingAtToken(this.getFirst(), SurfaceFormLevel.POS_TAGGED); } - public static enum Level { - RAW, - POS_TAGGED, - STEMMED - } - /** * Returns a string containing all tokens starting at the token {@code start} until the end of the list. The * surface forms according to {@code level} are used to build the string. @@ -37,7 +31,7 @@ * @param l level of surface forms to use * @return built string */ - public String getContentStartingAtToken(Token start, Level l) { + public String getContentStartingAtToken(Token start, SurfaceFormLevel l) { StringBuilder sb = new StringBuilder(); boolean found = false; for (Token t : this) { @@ -54,7 +48,7 @@ return sb.toString(); } - private String getStringForLevel(Token t, Level l) { + private String getStringForLevel(Token t, SurfaceFormLevel l) { switch (l) { case RAW: return t.getRawForm(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-11-21 15:24:35
|
Revision: 4170 http://sourceforge.net/p/dl-learner/code/4170 Author: dfleischhacker Date: 2013-11-21 15:24:32 +0000 (Thu, 21 Nov 2013) Log Message: ----------- Remove unused offset-based methods Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-11-21 13:53:20 UTC (rev 4169) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedDocument.java 2013-11-21 15:24:32 UTC (rev 4170) @@ -3,10 +3,10 @@ */ package org.dllearner.algorithms.isle.index; +import org.dllearner.core.owl.Entity; + import java.util.Set; -import org.dllearner.core.owl.Entity; - /** * @author Lorenz Buehmann * @@ -25,14 +25,6 @@ */ Set<SemanticAnnotation> getAnnotations(); - /** - * Returns the annotation at the given position(offset) of given length. - * @param offset - * @param length - * @return - */ - SemanticAnnotation getAnnotation(int offset, int length); - /** * Returns the number of occurrences of the given entity in this document. * Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-11-21 13:53:20 UTC (rev 4169) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/AnnotatedTextDocument.java 2013-11-21 15:24:32 UTC (rev 4170) @@ -3,11 +3,11 @@ */ package org.dllearner.algorithms.isle.index; +import org.dllearner.core.owl.Entity; + import java.util.HashSet; import java.util.Set; -import org.dllearner.core.owl.Entity; - /** * @author Lorenz Buehmann * @@ -70,19 +70,6 @@ } /* (non-Javadoc) - * @see org.dllearner.algorithms.isle.index.AnnotatedDocument#getAnnotation(int, int) - */ - @Override - public SemanticAnnotation getAnnotation(int offset, int length) { - for (SemanticAnnotation annotation : annotations) { - if(annotation.getOffset() == offset && annotation.getLength() == length){ - return annotation; - } - } - return null; - } - - /* (non-Javadoc) * @see org.dllearner.algorithms.isle.index.AnnotatedDocument#getEntityFrequency(org.dllearner.core.owl.Entity) */ @Override This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-11-25 09:47:39
|
Revision: 4174 http://sourceforge.net/p/dl-learner/code/4174 Author: lorenz_b Date: 2013-11-25 09:47:35 +0000 (Mon, 25 Nov 2013) Log Message: ----------- ISLE. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-11-25 09:42:56 UTC (rev 4173) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-11-25 09:47:35 UTC (rev 4174) @@ -60,6 +60,7 @@ curNormalizedLength += p.getNormalizedLength(); curOriginalLength += p.getOriginalLength(); if (curNormalizedLength >= length) { + //TODO refactoring // return new Annotation(originalDocument, originalStart, curOriginalLength); } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-11-25 09:42:56 UTC (rev 4173) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-11-25 09:47:35 UTC (rev 4174) @@ -1,6 +1,8 @@ package org.dllearner.algorithms.isle.index; +import com.google.common.collect.Lists; import com.google.common.collect.Sets; + import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.StopWordFilter; import org.dllearner.core.owl.Entity; @@ -39,6 +41,7 @@ public void postProcess(HashMap<Annotation,Set<Entity>> candidatesMap, int window, StopWordFilter stopWordFilter) { Set<Annotation> annotations = candidatesMap.keySet(); List<Annotation> sortedAnnotations = new ArrayList<Annotation>(annotations); + //TODO refactoring /** @@ -108,17 +111,10 @@ } private Annotation mergeAnnotations(Annotation annotation_i, Annotation annotation_j) { - return null; -// int offset; -// int length; -// if (annotation_i.getOffset() < annotation_j.getOffset()) { -// offset = annotation_i.getOffset(); -// length = annotation_j.getOffset() - offset + annotation_j.getLength(); -// } else { -// offset = annotation_j.getOffset(); -// length = annotation_i.getOffset() - offset + annotation_i.getLength(); -// } -// return new Annotation(annotation_i.getReferencedDocument(), offset, length); + List<Token> tokens = Lists.newArrayList(); + tokens.addAll(annotation_i.getTokens()); + tokens.addAll(annotation_j.getTokens()); + return new Annotation(annotation_i.getReferencedDocument(), tokens); } @Override This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-12-02 14:59:39
|
Revision: 4184 http://sourceforge.net/p/dl-learner/code/4184 Author: dfleischhacker Date: 2013-12-02 14:59:36 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Adapt to new Token implementation Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-02 14:52:33 UTC (rev 4183) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/EntityCandidatesTrie.java 2013-12-02 14:59:36 UTC (rev 4184) @@ -2,6 +2,7 @@ import org.dllearner.core.owl.Entity; +import java.util.List; import java.util.Set; public interface EntityCandidatesTrie { @@ -11,7 +12,7 @@ * @param s * @param e */ - public void addEntry(String s, Entity e); + public void addEntry(List<Token> s, Entity e); /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 14:52:33 UTC (rev 4183) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 14:59:36 UTC (rev 4184) @@ -3,7 +3,6 @@ import org.apache.commons.lang.StringUtils; import org.dllearner.algorithms.isle.textretrieval.EntityTextRetriever; import org.dllearner.core.owl.Entity; -import org.dllearner.utilities.MapUtils; import org.dllearner.utilities.datastructures.PrefixTrie; import org.semanticweb.owlapi.model.OWLOntology; @@ -11,7 +10,7 @@ import java.util.Map.Entry; public class SimpleEntityCandidatesTrie implements EntityCandidatesTrie { - + TokenTree tree; PrefixTrie<FullTokenEntitySetPair> trie; EntityTextRetriever entityTextRetriever; @@ -41,7 +40,7 @@ } public void buildTrie(OWLOntology ontology, NameGenerator nameGenerator) { - this.trie = new PrefixTrie<FullTokenEntitySetPair>(); + this.tree = new TokenTree(); Map<Entity, Set<List<Token>>> entity2TokenSet = entityTextRetriever.getRelevantText(ontology); @@ -62,51 +61,45 @@ /** * Adds the subsequences of a test * @param entity - * @param text + * @param tokens */ - private void addSubsequences(Entity entity, String text) { - if (text.contains(" ")) { - String[] tokens = text.split(" "); - for (int size=1; size<tokens.length; size++) { - - for (int start=0; start<tokens.length-size+1; start++) { - String subsequence = ""; - for (int i=0; i<size; i++) { - subsequence += tokens[start+i] + " "; - } - subsequence = subsequence.trim(); - - addEntry(subsequence, entity); - } - - } - } - } - - private void addSubsequencesWordNet(Entity entity, String text) { - if (text.contains(" ")) { - String[] tokens = text.split(" "); - - List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; - - // generate list of lemmatized wordnet synonyms for each token - for (int i = 0; i < tokens.length; i++) { - wordnetTokens[i] = new ArrayList<String>(); - wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i].toLowerCase())); - for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { - wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).toLowerCase()); + private void addSubsequences(Entity entity, List<Token> tokens) { + tree.add(tokens, entity); + for (int size = 1; size < tokens.size(); size++) { + for (int start = 0; start < tokens.size() - size + 1; start++) { + ArrayList<Token> subsequence = new ArrayList<>(); + for (int i = 0; i < size; i++) { + subsequence.add(tokens.get(start + i)); } + addEntry(subsequence, entity); } - - // generate subsequences starting at the given start index of the given size - Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens); - - for (String[] s : allPossibleSubsequences) { - addEntry(s[0], entity, s[1]); - } } } +// private void addSubsequencesWordNet(Entity entity, String text) { +// if (text.contains(" ")) { +// String[] tokens = text.split(" "); +// +// List<String>[] wordnetTokens = (ArrayList<String>[]) new ArrayList[tokens.length]; +// +// // generate list of lemmatized wordnet synonyms for each token +// for (int i = 0; i < tokens.length; i++) { +// wordnetTokens[i] = new ArrayList<String>(); +// wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(tokens[i].toLowerCase())); +// for (String w : LinguisticUtil.getInstance().getTopSynonymsForWord(tokens[i], 5)) { +// wordnetTokens[i].add(LinguisticUtil.getInstance().getNormalizedForm(w).toLowerCase()); +// } +// } +// +// // generate subsequences starting at the given start index of the given size +// Set<String[]> allPossibleSubsequences = getAllPossibleSubsequences(tokens, wordnetTokens); +// +// for (String[] s : allPossibleSubsequences) { +// addEntry(s[0], entity, s[1]); +// } +// } +// } + private static Set<String[]> getAllPossibleSubsequences(String[] originalTokens, List<String>[] wordnetTokens) { ArrayList<String[]> res = new ArrayList<String[]>(); @@ -143,30 +136,12 @@ } @Override - public void addEntry(String s, Entity e) { - s = s.trim(); - FullTokenEntitySetPair candidates; - if (trie.contains(s)) - candidates = trie.get(s); - else - candidates = new FullTokenEntitySetPair(s); - - candidates.addEntity(e); - - trie.put(s, candidates); + public void addEntry(List<Token> s, Entity e) { + tree.add(s, e); } - public void addEntry(String s, Entity e, String originalString) { - s = s.trim(); - FullTokenEntitySetPair candidates; - if (trie.contains(s)) - candidates = trie.get(s); - else - candidates = new FullTokenEntitySetPair(originalString); - - candidates.addEntity(e); - - trie.put(s, candidates); + public void addEntry(List<Token> s, Entity e, List<Token> originalTokens) { + tree.add(s, e, originalTokens); } @Override Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:52:33 UTC (rev 4183) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 14:59:36 UTC (rev 4184) @@ -15,10 +15,12 @@ public class TokenTree { private HashMap<Token, TokenTree> children; private Set<Entity> entities; + private List<Token> originalTokens; public TokenTree() { this.children = new HashMap<>(); this.entities = new HashSet<>(); + this.originalTokens = new ArrayList<>(); } /** @@ -27,7 +29,7 @@ * @param tokens tokens to locate insertion point for entities * @param entities entities to add */ - public void add(List<Token> tokens, Set<Entity> entities) { + public void add(List<Token> tokens, Set<Entity> entities, List<Token> originalTokens) { TokenTree curNode = this; for (Token t : tokens) { TokenTree nextNode = curNode.children.get(t); @@ -38,8 +40,13 @@ curNode = nextNode; } curNode.entities.addAll(entities); + curNode.originalTokens = new ArrayList<>(originalTokens); } + public void add(List<Token> tokens, Set<Entity> entities) { + add(tokens, entities, tokens); + } + /** * Adds the given entity to the tree. * @@ -50,6 +57,10 @@ add(tokens, Collections.singleton(entity)); } + public void add(List<Token> tokens, Entity entity, List<Token> originalTokens) { + add(tokens, Collections.singleton(entity), originalTokens); + } + /** * Returns the set of entities located by the given list of tokens. * @@ -112,6 +123,27 @@ return fallback == null ? Collections.<Entity>emptySet() : fallback.entities; } + /** + * Returns the original token for the longest match + */ + public List<Token> getOriginalTokensForLongestMatch(List<Token> tokens) { + TokenTree fallback = this.entities.isEmpty() ? null : this; + TokenTree curNode = this; + + for (Token t : tokens) { + TokenTree nextNode = curNode.children.get(t); + if (nextNode == null) { + return fallback == null ? null : fallback.originalTokens; + } + curNode = nextNode; + if (!curNode.entities.isEmpty()) { + fallback = curNode; + } + } + + return fallback == null ? Collections.<Token>emptyList() : fallback.originalTokens; + } + public static void main(String[] args) throws Exception { List<Token> tokens1 = Lists.newLinkedList(); for (String s : Splitter.on(" ").split("this is a token tree")) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <lor...@us...> - 2013-12-02 15:43:51
|
Revision: 4188 http://sourceforge.net/p/dl-learner/code/4188 Author: lorenz_b Date: 2013-12-02 15:43:48 +0000 (Mon, 02 Dec 2013) Log Message: ----------- Refactoring ISLE. Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 15:22:04 UTC (rev 4187) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-02 15:43:48 UTC (rev 4188) @@ -160,18 +160,7 @@ } public String toString() { - StringBuilder output = new StringBuilder(); - Map<String,FullTokenEntitySetPair> trieMap = trie.toMap(); - - for (Entry<String, FullTokenEntitySetPair> entry : trieMap.entrySet()) { - String key = entry.getKey(); - FullTokenEntitySetPair pair = entry.getValue(); - output.append(key + " (" + pair.getFullToken() + ") :\n"); - for (Entity candidate: pair.getEntitySet()) { - output.append("\t"+candidate+"\n"); - } - } - return output.toString(); + return tree.toString(); } public static void main(String[] args) { Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 15:22:04 UTC (rev 4187) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TokenTree.java 2013-12-02 15:43:48 UTC (rev 4188) @@ -16,12 +16,21 @@ private HashMap<Token, TokenTree> children; private Set<Entity> entities; private List<Token> originalTokens; + private boolean ignoreStopWords = true; public TokenTree() { this.children = new HashMap<>(); this.entities = new HashSet<>(); this.originalTokens = new ArrayList<>(); } + + /** + * If set to TRUE, stopwords like 'of, on' are ignored during creation and retrieval operations. + * @param ignoreStopWords the ignoreStopWords to set + */ + public void setIgnoreStopWords(boolean ignoreStopWords) { + this.ignoreStopWords = ignoreStopWords; + } /** * Adds all given entities to the end of the path resulting from the given tokens. @@ -32,12 +41,16 @@ public void add(List<Token> tokens, Set<Entity> entities, List<Token> originalTokens) { TokenTree curNode = this; for (Token t : tokens) { - TokenTree nextNode = curNode.children.get(t); - if (nextNode == null) { - nextNode = new TokenTree(); - curNode.children.put(t, nextNode); - } - curNode = nextNode; + if(!ignoreStopWords || (ignoreStopWords && !t.isStopWord())){ + TokenTree nextNode = curNode.children.get(t); + if (nextNode == null) { + nextNode = new TokenTree(); + curNode.children.put(t, nextNode); + } + curNode = nextNode; + } else { + System.out.println("ignored " + t); + } } curNode.entities.addAll(entities); curNode.originalTokens = new ArrayList<>(originalTokens); Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-02 15:22:04 UTC (rev 4187) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieLinguisticAnnotator.java 2013-12-02 15:43:48 UTC (rev 4188) @@ -27,14 +27,14 @@ @Override public Set<Annotation> annotate(TextDocument document) { Set<Annotation> annotations = new HashSet<Annotation>(); - NormalizedTextMapper mapper = new NormalizedTextMapper(document); - String content = mapper.getNormalizedText(); List<Token> matchedTokens; for (Token token : document) { matchedTokens = candidatesTrie.getLongestMatchingText(document.getTokensStartingAtToken(token, true)); - Annotation annotation = new Annotation(document, matchedTokens); - annotations.add(annotation); + if(matchedTokens != null && !matchedTokens.isEmpty()){ + Annotation annotation = new Annotation(document, matchedTokens); + annotations.add(annotation); + } } return annotations; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <dfl...@us...> - 2013-12-09 15:37:42
|
Revision: 4199 http://sourceforge.net/p/dl-learner/code/4199 Author: dfleischhacker Date: 2013-12-09 15:37:39 +0000 (Mon, 09 Dec 2013) Log Message: ----------- Cleanup and show alternative names Modified Paths: -------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java Removed Paths: ------------- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java 2013-12-09 15:36:38 UTC (rev 4198) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/FullTokenEntitySetPair.java 2013-12-09 15:37:39 UTC (rev 4199) @@ -1,31 +0,0 @@ -package org.dllearner.algorithms.isle.index; - -import org.dllearner.core.owl.Entity; - -import java.util.HashSet; -import java.util.Set; - -/** - * A pair consisting of a full string token and the corresponding entities - */ -public class FullTokenEntitySetPair { - private String fullToken; - private Set<Entity> entitySet; - - public FullTokenEntitySetPair(String fullToken) { - this.fullToken = fullToken; - this.entitySet = new HashSet<Entity>(); - } - - public String getFullToken() { - return fullToken; - } - - public Set<Entity> getEntitySet() { - return entitySet; - } - - public void addEntity(Entity entity) { - entitySet.add(entity); - } -} Deleted: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-12-09 15:36:38 UTC (rev 4198) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/NormalizedTextMapper.java 2013-12-09 15:37:39 UTC (rev 4199) @@ -1,141 +0,0 @@ -package org.dllearner.algorithms.isle.index; - -import java.util.ArrayList; - -/** - * Provides text normalization and mapping of normalized ranges to the original ones. - */ -public class NormalizedTextMapper { - private Document originalDocument; - private String originalText; - private String normalizedText; - - private ArrayList<OccurenceMappingPair> normalizedIndexToOriginalIndex; - - public NormalizedTextMapper(Document original) { - this.originalDocument = original; - this.originalText = original.getContent(); - this.normalizedIndexToOriginalIndex = new ArrayList<OccurenceMappingPair>(); - - StringBuilder sb = new StringBuilder(); - int currentOriginalIndex = 0; - for (String originalWord : originalText.split(" ")) { - String normalizedWord = getNormalizedWord(originalWord); - normalizedIndexToOriginalIndex - .add(new OccurenceMappingPair(currentOriginalIndex, originalWord.length(), sb.length(), - normalizedWord.length())); - currentOriginalIndex += originalWord.length() + 1; - sb.append(normalizedWord); - sb.append(" "); - } - normalizedText = sb.toString(); - } - - public String getOriginalText() { - return originalText; - } - - public String getNormalizedText() { - return normalizedText; - } - - /** - * Returns the annotation for the original text matching the given position and length in the normalized - * text. - * - * @param position position in the normalized text to get annotation for - * @param length length of the text to get annotation for - * @return - */ - public Annotation getOriginalAnnotationForPosition(int position, int length) { - int curNormalizedLength = 0; - int originalStart = -1; - int curOriginalLength = 0; - - for (OccurenceMappingPair p : normalizedIndexToOriginalIndex) { - if (p.getNormalizedIndex() == position) { - originalStart = p.getOriginalIndex(); - } - if (originalStart != -1) { - curNormalizedLength += p.getNormalizedLength(); - curOriginalLength += p.getOriginalLength(); - if (curNormalizedLength >= length) { - //TODO refactoring -// return new Annotation(originalDocument, originalStart, curOriginalLength); - } - - // include space - curNormalizedLength += 1; - curOriginalLength += 1; - } - } - - return null; - } - - /** - * Returns the normalized form of the given word. Word must not contain any spaces or the like. - * @param word - * @return - */ - private String getNormalizedWord(String word) { - return LinguisticUtil.getInstance().getNormalizedForm(word); - } - - public static void main(String[] args) { -// NormalizedTextMapper n = new NormalizedTextMapper(new TextDocument("This is a testing text using letters")); -// System.out.println(n.getOriginalText()); -// System.out.println(n.getNormalizedText()); -// for (OccurenceMappingPair p : n.normalizedIndexToOriginalIndex) { -// System.out.println(p); -// } -// System.out.println(n.getOriginalAnnotationForPosition(7,6)); -// System.out.println(n.getOriginalAnnotationForPosition(23,6)); -// System.out.println(n.getOriginalAnnotationForPosition(7,1)); -// System.out.println(n.getOriginalAnnotationForPosition(14,15)); - } - - /** - * Maps words identified by index and length in the normalized texts to the original word. - */ - private class OccurenceMappingPair { - private int originalIndex; - private int originalLength; - private int normalizedIndex; - private int normalizedLength; - - private OccurenceMappingPair(int originalIndex, int originalLength, int normalizedIndex, int normalizedLength) { - - this.originalIndex = originalIndex; - this.originalLength = originalLength; - this.normalizedIndex = normalizedIndex; - this.normalizedLength = normalizedLength; - } - - private int getNormalizedIndex() { - return normalizedIndex; - } - - private int getNormalizedLength() { - return normalizedLength; - } - - private int getOriginalLength() { - return originalLength; - } - - private int getOriginalIndex() { - return originalIndex; - } - - @Override - public String toString() { - return "OccurenceMappingPair{" + - "originalIndex=" + originalIndex + - ", originalLength=" + originalLength + - ", normalizedIndex=" + normalizedIndex + - ", normalizedLength=" + normalizedLength + - '}'; - } - } -} Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-09 15:36:38 UTC (rev 4198) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/SimpleEntityCandidatesTrie.java 2013-12-09 15:37:39 UTC (rev 4199) @@ -91,6 +91,10 @@ String[] synonyms = LinguisticUtil.getInstance().getSynonymsForWord(t.getRawForm(), wordnetPos); for (String synonym : synonyms) { + // ignore all multi word synonyms + if (synonym.contains("_")) { + continue; + } t.addAlternativeForm(LinguisticUtil.getInstance().getNormalizedForm(synonym)); } } Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 15:36:38 UTC (rev 4198) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/Token.java 2013-12-09 15:37:39 UTC (rev 4199) @@ -138,7 +138,7 @@ */ @Override public String toString() { - return "[Word: " + rawForm + " | Stemmed word: " + stemmedForm + " | POS tag: " + posTag + "]"; + return "[Word: " + rawForm + " | Stemmed word: " + stemmedForm + " | POS tag: " + posTag + " | Alternatives: " + alternativeForms.toString() + "]"; } /** Modified: trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java =================================================================== --- trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-09 15:36:38 UTC (rev 4198) +++ trunk/components-core/src/main/java/org/dllearner/algorithms/isle/index/TrieEntityCandidateGenerator.java 2013-12-09 15:37:39 UTC (rev 4199) @@ -1,15 +1,15 @@ package org.dllearner.algorithms.isle.index; import com.google.common.collect.Lists; -import com.google.common.collect.Sets; - import org.dllearner.algorithms.isle.EntityCandidateGenerator; import org.dllearner.algorithms.isle.StopWordFilter; import org.dllearner.core.owl.Entity; import org.semanticweb.owlapi.model.OWLOntology; -import java.util.*; -import java.util.regex.Pattern; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Set; /** * Generates candidates using a entity candidates prefix trie @@ -34,7 +34,6 @@ /** * Postprocess the annotations generated by annotate * The objective is to merge annotations which are likely to belong to the same entity - * @param annotations : set of annotations * @param window : maximum distance between the annotations * @return */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |